10 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
11 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
18 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
19 #include <Tpetra_Distributor.hpp>
20 #include <Tpetra_BlockMultiVector.hpp>
22 #include <Kokkos_ArithTraits.hpp>
23 #include <KokkosBatched_Util.hpp>
24 #include <KokkosBatched_Vector.hpp>
25 #include <KokkosBatched_Copy_Decl.hpp>
26 #include <KokkosBatched_Copy_Impl.hpp>
27 #include <KokkosBatched_AddRadial_Decl.hpp>
28 #include <KokkosBatched_AddRadial_Impl.hpp>
29 #include <KokkosBatched_SetIdentity_Decl.hpp>
30 #include <KokkosBatched_SetIdentity_Impl.hpp>
31 #include <KokkosBatched_Gemm_Decl.hpp>
32 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
33 #include <KokkosBatched_Gemm_Team_Impl.hpp>
34 #include <KokkosBatched_Gemv_Decl.hpp>
35 #include <KokkosBatched_Gemv_Team_Impl.hpp>
36 #include <KokkosBatched_Trsm_Decl.hpp>
37 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
38 #include <KokkosBatched_Trsm_Team_Impl.hpp>
39 #include <KokkosBatched_Trsv_Decl.hpp>
40 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
41 #include <KokkosBatched_Trsv_Team_Impl.hpp>
42 #include <KokkosBatched_LU_Decl.hpp>
43 #include <KokkosBatched_LU_Serial_Impl.hpp>
44 #include <KokkosBatched_LU_Team_Impl.hpp>
46 #include <KokkosBlas1_nrm1.hpp>
47 #include <KokkosBlas1_nrm2.hpp>
51 #include "Ifpack2_BlockHelper.hpp"
52 #include "Ifpack2_BlockComputeResidualVector.hpp"
59 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
60 #include "cuda_profiler_api.h"
65 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
73 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
77 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
80 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
81 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
85 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
89 namespace BlockTriDiContainerDetails {
91 namespace KB = KokkosBatched;
98 template <
typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
99 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
100 MemoryTraitsType::is_random_access |
103 template <
typename ViewType>
104 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
105 typename ViewType::array_layout,
106 typename ViewType::device_type,
107 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
108 template <
typename ViewType>
109 using Atomic = Kokkos::View<
typename ViewType::data_type,
110 typename ViewType::array_layout,
111 typename ViewType::device_type,
112 MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
113 template <
typename ViewType>
114 using Const = Kokkos::View<
typename ViewType::const_data_type,
115 typename ViewType::array_layout,
116 typename ViewType::device_type,
117 typename ViewType::memory_traits>;
118 template <
typename ViewType>
119 using ConstUnmanaged = Const<Unmanaged<ViewType> >;
121 template <
typename ViewType>
122 using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
124 template <
typename ViewType>
125 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
126 typename ViewType::array_layout,
127 typename ViewType::device_type,
128 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
131 template <
typename ViewType>
132 using Scratch = Kokkos::View<
typename ViewType::data_type,
133 typename ViewType::array_layout,
134 typename ViewType::execution_space::scratch_memory_space,
135 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
141 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
146 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
147 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
148 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
150 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
151 { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
153 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
155 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
161 template<
typename MatrixType>
164 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
166 using tpetra_map_type =
typename impl_type::tpetra_map_type;
167 using tpetra_mv_type =
typename impl_type::tpetra_block_multivector_type;
168 using tpetra_import_type =
typename impl_type::tpetra_import_type;
169 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
170 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
172 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A);
173 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A);
175 bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
178 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
180 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
181 const auto src =
Teuchos::rcp(
new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
182 const auto tgt =
Teuchos::rcp(
new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
183 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
192 template<
typename MatrixType>
193 struct AsyncableImport {
201 #if !defined(HAVE_IFPACK2_MPI)
202 typedef int MPI_Request;
203 typedef int MPI_Comm;
205 using scalar_type =
typename impl_type::scalar_type;
209 static int isend(
const MPI_Comm comm,
const char* buf,
int count,
int dest,
int tag, MPI_Request* ireq) {
210 #ifdef HAVE_IFPACK2_MPI
212 int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
213 if (ireq == NULL) MPI_Request_free(&ureq);
220 static int irecv(
const MPI_Comm comm,
char* buf,
int count,
int src,
int tag, MPI_Request* ireq) {
221 #ifdef HAVE_IFPACK2_MPI
223 int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
224 if (ireq == NULL) MPI_Request_free(&ureq);
231 static int waitany(
int count, MPI_Request* reqs,
int* index) {
232 #ifdef HAVE_IFPACK2_MPI
233 return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
239 static int waitall(
int count, MPI_Request* reqs) {
240 #ifdef HAVE_IFPACK2_MPI
241 return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
248 using tpetra_map_type =
typename impl_type::tpetra_map_type;
249 using tpetra_import_type =
typename impl_type::tpetra_import_type;
251 using local_ordinal_type =
typename impl_type::local_ordinal_type;
252 using global_ordinal_type =
typename impl_type::global_ordinal_type;
256 using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
257 using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
259 using execution_space =
typename impl_type::execution_space;
260 using memory_space =
typename impl_type::memory_space;
261 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
263 using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
265 #if defined(KOKKOS_ENABLE_CUDA)
266 using impl_scalar_type_1d_view =
267 typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
268 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
269 Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
270 # elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
271 Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
272 # else // no experimental macros are defined
273 typename impl_type::impl_scalar_type_1d_view,
275 typename impl_type::impl_scalar_type_1d_view>::type;
277 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
279 using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type*,Kokkos::HostSpace>;
280 using impl_scalar_type_2d_view =
typename impl_type::impl_scalar_type_2d_view;
281 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
283 #ifdef HAVE_IFPACK2_MPI
287 impl_scalar_type_2d_view_tpetra remote_multivector;
288 local_ordinal_type blocksize;
291 struct SendRecvPair {
296 SendRecvPair<int_1d_view_host> pids;
297 SendRecvPair<std::vector<MPI_Request> > reqs;
298 SendRecvPair<size_type_1d_view> offset;
299 SendRecvPair<size_type_1d_view_host> offset_host;
300 SendRecvPair<local_ordinal_type_1d_view> lids;
301 SendRecvPair<impl_scalar_type_1d_view> buffer;
302 SendRecvPair<impl_scalar_type_1d_view_host> buffer_host;
304 local_ordinal_type_1d_view dm2cm;
306 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
307 using exec_instance_1d_std_vector = std::vector<execution_space>;
308 exec_instance_1d_std_vector exec_instances;
314 const size_type_1d_view &offs) {
316 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.
getRawPtr()), lens.
size());
317 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
320 const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
321 const local_ordinal_type lens_size = lens_device.extent(0);
322 Kokkos::parallel_scan
323 (
"AsyncableImport::RangePolicy::setOffsetValues",
324 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
327 update += (i < lens_size ? lens_device[i] : 0);
332 const size_type_1d_view_host &offs) {
334 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.
getRawPtr()), lens.
size());
335 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
339 for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
340 offs(i) = offs(i-1) + lens[i-1];
345 void createMpiRequests(
const tpetra_import_type &
import) {
346 Tpetra::Distributor &distributor =
import.getDistributor();
349 const auto pids_from = distributor.getProcsFrom();
351 memcpy(pids.recv.data(), pids_from.getRawPtr(),
sizeof(int)*pids.recv.extent(0));
353 const auto pids_to = distributor.getProcsTo();
355 memcpy(pids.send.data(), pids_to.getRawPtr(),
sizeof(int)*pids.send.extent(0));
358 reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*
sizeof(MPI_Request));
359 reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*
sizeof(MPI_Request));
363 const auto lengths_to = distributor.getLengthsTo();
366 const auto lengths_from = distributor.getLengthsFrom();
369 setOffsetValues(lengths_to, offset.send);
370 offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
372 setOffsetValues(lengths_from, offset.recv);
373 offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
375 const auto lengths_to = distributor.getLengthsTo();
376 offset_host.send = size_type_1d_view_host(
do_not_initialize_tag(
"offset send"), lengths_to.size() + 1);
378 const auto lengths_from = distributor.getLengthsFrom();
379 offset_host.recv = size_type_1d_view_host(
do_not_initialize_tag(
"offset recv"), lengths_from.size() + 1);
381 setOffsetValuesHost(lengths_to, offset_host.send);
384 setOffsetValuesHost(lengths_from, offset_host.recv);
389 void createSendRecvIDs(
const tpetra_import_type &
import) {
391 const auto remote_lids =
import.getRemoteLIDs();
392 const local_ordinal_type_1d_view_host
393 remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
395 Kokkos::deep_copy(lids.recv, remote_lids_view_host);
398 auto epids =
import.getExportPIDs();
399 auto elids =
import.getExportLIDs();
402 auto lids_send_host = Kokkos::create_mirror_view(lids.send);
405 for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
406 const auto pid_send_value = pids.send[i];
407 for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
408 if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
409 TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
411 Kokkos::deep_copy(lids.send, lids_send_host);
414 void createExecutionSpaceInstances() {
415 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
418 Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
425 struct ToMultiVector {};
429 const local_ordinal_type blocksize_,
430 const local_ordinal_type_1d_view dm2cm_) {
431 blocksize = blocksize_;
434 #ifdef HAVE_IFPACK2_MPI
435 comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
437 const tpetra_import_type
import(src_map, tgt_map);
439 createMpiRequests(
import);
440 createSendRecvIDs(
import);
441 createExecutionSpaceInstances();
444 void createDataBuffer(
const local_ordinal_type &num_vectors) {
445 const size_type extent_0 = lids.recv.extent(0)*blocksize;
446 const size_type extent_1 = num_vectors;
447 if (remote_multivector.extent(0) == extent_0 &&
448 remote_multivector.extent(1) == extent_1) {
454 const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
455 const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
460 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
461 buffer_host.send = impl_scalar_type_1d_view_host(
do_not_initialize_tag(
"buffer send"), send_buffer_size);
462 buffer_host.recv = impl_scalar_type_1d_view_host(
do_not_initialize_tag(
"buffer recv"), recv_buffer_size);
468 #ifdef HAVE_IFPACK2_MPI
469 waitall(reqs.recv.size(), reqs.recv.data());
470 waitall(reqs.send.size(), reqs.send.data());
478 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
479 template<
typename PackTag>
481 void copy(
const local_ordinal_type_1d_view &lids_,
482 const impl_scalar_type_1d_view &buffer_,
483 const local_ordinal_type ibeg_,
484 const local_ordinal_type iend_,
485 const impl_scalar_type_2d_view_tpetra &multivector_,
486 const local_ordinal_type blocksize_,
487 const execution_space &exec_instance_) {
488 const local_ordinal_type num_vectors = multivector_.extent(1);
489 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
490 const local_ordinal_type idiff = iend_ - ibeg_;
491 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
493 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
494 local_ordinal_type vector_size(0);
495 if (blocksize_ <= 4) vector_size = 4;
496 else if (blocksize_ <= 8) vector_size = 8;
497 else if (blocksize_ <= 16) vector_size = 16;
498 else vector_size = 32;
500 const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
501 const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
504 Kokkos::Experimental::require(policy, work_item_property),
505 KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
506 const local_ordinal_type i = member.league_rank();
508 (Kokkos::TeamThreadRange(member,num_vectors),[&](
const local_ordinal_type &j) {
509 auto aptr = abase + blocksize_*(i + idiff*j);
510 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
511 if (std::is_same<PackTag,ToBuffer>::value)
513 (Kokkos::ThreadVectorRange(member,blocksize_),[&](
const local_ordinal_type &k) {
518 (Kokkos::ThreadVectorRange(member,blocksize_),[&](
const local_ordinal_type &k) {
525 void asyncSendRecvVar1(
const impl_scalar_type_2d_view_tpetra &mv) {
526 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
528 #ifdef HAVE_IFPACK2_MPI
530 const local_ordinal_type num_vectors = mv.extent(1);
531 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
534 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
535 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
537 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
538 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*
sizeof(impl_scalar_type),
545 reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
546 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*
sizeof(impl_scalar_type),
554 execution_space().fence();
557 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
559 if (i<8) exec_instances[i%8].fence();
560 copy<ToBuffer>(lids.send, buffer.send,
561 offset_host.send(i), offset_host.send(i+1),
564 exec_instances[i%8]);
565 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
567 const local_ordinal_type num_vectors = mv.extent(1);
568 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
570 Kokkos::deep_copy(exec_instances[i%8],
571 Kokkos::subview(buffer_host.send,
572 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
573 offset_host.send(i)*mv_blocksize,
574 offset_host.send(i+1)*mv_blocksize)),
575 Kokkos::subview(buffer.send,
576 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
577 offset_host.send(i)*mv_blocksize,
578 offset_host.send(i+1)*mv_blocksize)));
583 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
585 if (i<8) exec_instances[i%8].fence();
586 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
588 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
589 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*
sizeof(impl_scalar_type),
596 reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
597 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*
sizeof(impl_scalar_type),
605 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
608 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
610 #endif // HAVE_IFPACK2_MPI
611 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
614 void syncRecvVar1() {
615 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
616 #ifdef HAVE_IFPACK2_MPI
618 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
619 local_ordinal_type idx = i;
622 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
624 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
625 const local_ordinal_type num_vectors = remote_multivector.extent(1);
626 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
629 Kokkos::subview(buffer.recv,
630 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
631 offset_host.recv(idx)*mv_blocksize,
632 offset_host.recv(idx+1)*mv_blocksize)),
633 Kokkos::subview(buffer_host.recv,
634 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
635 offset_host.recv(idx)*mv_blocksize,
636 offset_host.recv(idx+1)*mv_blocksize)));
640 copy<ToMultiVector>(lids.recv, buffer.recv,
641 offset_host.recv(idx), offset_host.recv(idx+1),
642 remote_multivector, blocksize,
643 exec_instances[idx%8]);
650 waitall(reqs.send.size(), reqs.send.data());
651 #endif // HAVE_IFPACK2_MPI
652 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
654 #endif //defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
661 template<
typename PackTag>
663 void copy(
const local_ordinal_type_1d_view &lids_,
664 const impl_scalar_type_1d_view &buffer_,
665 const local_ordinal_type &ibeg_,
666 const local_ordinal_type &iend_,
667 const impl_scalar_type_2d_view_tpetra &multivector_,
668 const local_ordinal_type blocksize_) {
669 const local_ordinal_type num_vectors = multivector_.extent(1);
670 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
671 const local_ordinal_type idiff = iend_ - ibeg_;
672 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
673 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
674 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
675 local_ordinal_type vector_size(0);
676 if (blocksize_ <= 4) vector_size = 4;
677 else if (blocksize_ <= 8) vector_size = 8;
678 else if (blocksize_ <= 16) vector_size = 16;
679 else vector_size = 32;
680 const team_policy_type policy(idiff, 1, vector_size);
682 (
"AsyncableImport::TeamPolicy::copy",
683 policy, KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
684 const local_ordinal_type i = member.league_rank();
686 (Kokkos::TeamThreadRange(member,num_vectors),[&](
const local_ordinal_type &j) {
687 auto aptr = abase + blocksize_*(i + idiff*j);
688 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
689 if (std::is_same<PackTag,ToBuffer>::value)
691 (Kokkos::ThreadVectorRange(member,blocksize_),[&](
const local_ordinal_type &k) {
696 (Kokkos::ThreadVectorRange(member,blocksize_),[&](
const local_ordinal_type &k) {
702 const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
704 (
"AsyncableImport::RangePolicy::copy",
705 policy, KOKKOS_LAMBDA(
const local_ordinal_type &ij) {
706 const local_ordinal_type i = ij%idiff;
707 const local_ordinal_type j = ij/idiff;
708 auto aptr = abase + blocksize_*(i + idiff*j);
709 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
710 auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
711 auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
712 memcpy(to, from,
sizeof(impl_scalar_type)*blocksize_);
721 void asyncSendRecvVar0(
const impl_scalar_type_2d_view_tpetra &mv) {
722 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
724 #ifdef HAVE_IFPACK2_MPI
726 const local_ordinal_type num_vectors = mv.extent(1);
727 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
730 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
731 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
733 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
734 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*
sizeof(impl_scalar_type),
741 reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
742 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*
sizeof(impl_scalar_type),
750 for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
751 copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
754 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
756 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
757 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*
sizeof(impl_scalar_type),
764 Kokkos::subview(buffer_host.send,
765 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
766 offset_host.send(i)*mv_blocksize,
767 offset_host.send(i+1)*mv_blocksize)),
768 Kokkos::subview(buffer.send,
769 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
770 offset_host.send(i)*mv_blocksize,
771 offset_host.send(i+1)*mv_blocksize)));
773 reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
774 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*
sizeof(impl_scalar_type),
783 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
786 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
789 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
792 void syncRecvVar0() {
793 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
794 #ifdef HAVE_IFPACK2_MPI
796 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
797 local_ordinal_type idx = i;
798 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
799 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
800 const local_ordinal_type num_vectors = remote_multivector.extent(1);
801 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
803 Kokkos::subview(buffer.recv,
804 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
805 offset_host.recv(idx)*mv_blocksize,
806 offset_host.recv(idx+1)*mv_blocksize)),
807 Kokkos::subview(buffer_host.recv,
808 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
809 offset_host.recv(idx)*mv_blocksize,
810 offset_host.recv(idx+1)*mv_blocksize)));
812 copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
813 remote_multivector, blocksize);
816 waitall(reqs.send.size(), reqs.send.data());
818 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
824 void asyncSendRecv(
const impl_scalar_type_2d_view_tpetra &mv) {
825 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
826 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
827 asyncSendRecvVar1(mv);
829 asyncSendRecvVar0(mv);
832 asyncSendRecvVar0(mv);
836 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
837 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
847 void syncExchange(
const impl_scalar_type_2d_view_tpetra &mv) {
848 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
851 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
854 impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView()
const {
return remote_multivector; }
857 template <
typename ViewType1,
typename ViewType2>
858 struct are_same_struct {
862 are_same_struct(ViewType1 keys1_, ViewType2 keys2_) : keys1(keys1_), keys2(keys2_) {}
863 KOKKOS_INLINE_FUNCTION
864 void operator()(
int i,
unsigned int& count)
const {
865 if (keys1(i) != keys2(i)) count++;
869 template <
typename ViewType1,
typename ViewType2>
870 bool are_same (ViewType1 keys1, ViewType2 keys2) {
871 unsigned int are_same_ = 0;
873 Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
874 are_same_struct(keys1, keys2),
882 template<
typename MatrixType>
887 using tpetra_map_type =
typename impl_type::tpetra_map_type;
888 using local_ordinal_type =
typename impl_type::local_ordinal_type;
889 using global_ordinal_type =
typename impl_type::global_ordinal_type;
890 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
891 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
892 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
893 using global_indices_array_device_type = Kokkos::View<const global_ordinal_type*, typename tpetra_map_type::device_type>;
895 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A);
896 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A);
898 bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
901 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
903 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
904 const auto domain_map = g.getDomainMap();
905 const auto column_map = g.getColMap();
907 std::vector<global_ordinal_type> gids;
909 Kokkos::Subview<global_indices_array_device_type, std::pair<int,int>> column_map_global_iD_last;
911 bool separate_remotes =
true, found_first =
false, need_owned_permutation =
false;
913 IFPACK2_BLOCKHELPER_TIMER(
"createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);
915 global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
916 global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();
918 if(are_same(domain_map_global_iD, column_map_global_iD)) {
920 separate_remotes =
true;
921 need_owned_permutation =
false;
923 column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
924 std::pair<int,int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
928 for (
size_t i=0;i<column_map->getLocalNumElements();++i) {
929 const global_ordinal_type gid = column_map->getGlobalElement(i);
930 if (!domain_map->isNodeGlobalElement(gid)) {
933 }
else if (found_first) {
934 separate_remotes =
false;
937 if (!found_first && !need_owned_permutation &&
938 domain_map->getLocalElement(gid) !=
static_cast<local_ordinal_type
>(i)) {
947 need_owned_permutation =
true;
951 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
954 if (separate_remotes) {
955 IFPACK2_BLOCKHELPER_TIMER(
"createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
957 const auto parsimonious_col_map
958 = need_owned_permutation ?
959 Teuchos::rcp(
new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())):
960 Teuchos::rcp(
new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
961 if (parsimonious_col_map->getGlobalNumElements() > 0) {
963 local_ordinal_type_1d_view dm2cm;
964 if (need_owned_permutation) {
965 dm2cm = local_ordinal_type_1d_view(
do_not_initialize_tag(
"dm2cm"), domain_map->getLocalNumElements());
966 const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
967 for (
size_t i=0;i<domain_map->getLocalNumElements();++i)
968 dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
969 Kokkos::deep_copy(dm2cm, dm2cm_host);
971 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
972 return Teuchos::rcp(
new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
975 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
976 return Teuchos::null;
979 template<
typename local_ordinal_type>
980 local_ordinal_type costTRSM(
const local_ordinal_type block_size) {
981 return block_size*block_size;
984 template<
typename local_ordinal_type>
985 local_ordinal_type costGEMV(
const local_ordinal_type block_size) {
986 return 2*block_size*block_size;
989 template<
typename local_ordinal_type>
990 local_ordinal_type costTriDiagSolve(
const local_ordinal_type subline_length,
const local_ordinal_type block_size) {
991 return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length-1) * costGEMV(block_size);
994 template<
typename local_ordinal_type>
995 local_ordinal_type costSolveSchur(
const local_ordinal_type num_parts,
996 const local_ordinal_type num_teams,
997 const local_ordinal_type line_length,
998 const local_ordinal_type block_size,
999 const local_ordinal_type n_subparts_per_part) {
1000 const local_ordinal_type subline_length = ceil(
double(line_length - (n_subparts_per_part-1) * 2) / n_subparts_per_part);
1001 if (subline_length < 1) {
1005 const local_ordinal_type p_n_lines = ceil(
double(num_parts)/num_teams);
1006 const local_ordinal_type p_n_sublines = ceil(
double(n_subparts_per_part)*num_parts/num_teams);
1007 const local_ordinal_type p_n_sublines_2 = ceil(
double(n_subparts_per_part-1)*num_parts/num_teams);
1009 const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
1010 const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part-1)*2,block_size);
1011 const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length,block_size);
1012 const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
1014 if (n_subparts_per_part == 1) {
1015 return p_costApplyAinv;
1017 return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
1020 template<
typename local_ordinal_type>
1021 local_ordinal_type getAutomaticNSubparts(
const local_ordinal_type num_parts,
1022 const local_ordinal_type num_teams,
1023 const local_ordinal_type line_length,
1024 const local_ordinal_type block_size) {
1025 local_ordinal_type n_subparts_per_part_0 = 1;
1026 local_ordinal_type flop_0 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0);
1027 local_ordinal_type flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0+1);
1028 while (flop_0 > flop_1) {
1030 flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, (++n_subparts_per_part_0)+1);
1032 return n_subparts_per_part_0;
1035 template<
typename ArgActiveExecutionMemorySpace>
1036 struct SolveTridiagsDefaultModeAndAlgo;
1041 template<
typename MatrixType>
1042 BlockHelperDetails::PartInterface<MatrixType>
1044 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
1046 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1049 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1050 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1051 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
1052 using size_type =
typename impl_type::size_type;
1054 auto bA = Teuchos::rcp_dynamic_cast<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
1057 const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
1058 constexpr
int vector_length = impl_type::vector_length;
1059 constexpr
int internal_vector_length = impl_type::internal_vector_length;
1061 const auto comm = A->getRowMap()->getComm();
1063 BlockHelperDetails::PartInterface<MatrixType> interf;
1065 const bool jacobi = partitions.size() == 0;
1066 const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
1067 const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1069 typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1070 std::vector<size_idx_pair_type> partsz(nparts);
1073 for (local_ordinal_type i=0;i<nparts;++i)
1074 partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1075 std::sort(partsz.begin(), partsz.end(),
1076 [] (
const size_idx_pair_type& x,
const size_idx_pair_type& y) {
1077 return x.first > y.first;
1081 local_ordinal_type n_subparts_per_part;
1082 if (n_subparts_per_part_in == -1) {
1085 using execution_space =
typename impl_type::execution_space;
1087 const int line_length = partsz[0].first;
1089 const local_ordinal_type team_size =
1090 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1091 recommended_team_size(blocksize, vector_length, internal_vector_length);
1093 const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
1095 n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1097 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1098 printf(
"Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1102 n_subparts_per_part = n_subparts_per_part_in;
1106 const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1109 const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1);
1111 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1112 local_ordinal_type nrows = 0;
1116 for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1119 (nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) <<
"The #rows implied by the local partition is not "
1120 <<
"the same as getLocalNumRows: " << nrows <<
" vs " << A_n_lclrows);
1124 std::vector<local_ordinal_type> p;
1126 interf.max_partsz = 1;
1127 interf.max_subpartsz = 0;
1128 interf.n_subparts_per_part = 1;
1129 interf.nparts = nparts;
1134 for (local_ordinal_type i=0;i<nparts;++i)
1135 p[i] = partsz[i].second;
1137 interf.max_partsz = partsz[0].first;
1139 constexpr local_ordinal_type connection_length = 2;
1140 const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1141 const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1143 interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1144 interf.n_subparts_per_part = n_subparts_per_part;
1145 interf.nparts = nparts;
1151 interf.part2rowidx0 = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2rowidx0"), nparts + 1);
1152 interf.part2packrowidx0 = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2packrowidx0"), nparts + 1);
1155 interf.part2rowidx0_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1156 interf.part2packrowidx0_sub = local_ordinal_type_2d_view(
do_not_initialize_tag(
"part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1157 interf.rowidx2part_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"rowidx2part"), A_n_lclrows);
1159 interf.partptr_sub = local_ordinal_type_2d_view(
do_not_initialize_tag(
"partptr_sub"), n_sub_parts_and_schur, 2);
1162 const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1163 const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1165 const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1166 const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1167 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1168 const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1170 const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1171 const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1172 const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1175 interf.row_contiguous =
true;
1177 part2rowidx0(0) = 0;
1178 part2packrowidx0(0) = 0;
1179 local_ordinal_type pack_nrows = 0;
1180 local_ordinal_type pack_nrows_sub = 0;
1182 IFPACK2_BLOCKHELPER_TIMER(
"compute part indices (Jacobi)", Jacobi);
1183 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1184 constexpr local_ordinal_type ipnrows = 1;
1186 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1189 if (ip % vector_length == 0) pack_nrows = ipnrows;
1190 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1191 const local_ordinal_type offset = partptr(ip);
1192 for (local_ordinal_type i=0;i<ipnrows;++i) {
1193 const auto lcl_row = ip;
1195 BlockHelperDetails::get_msg_prefix(comm)
1196 <<
"partitions[" << p[ip] <<
"]["
1197 << i <<
"] = " << lcl_row
1198 <<
" but input matrix implies limits of [0, " << A_n_lclrows-1
1200 lclrow(offset+i) = lcl_row;
1201 rowidx2part(offset+i) = ip;
1202 if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1203 interf.row_contiguous =
false;
1205 partptr(ip+1) = offset + ipnrows;
1207 part2rowidx0_sub(0) = 0;
1208 partptr_sub(0, 0) = 0;
1210 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1211 constexpr local_ordinal_type ipnrows = 1;
1212 const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1215 (full_line_length != ipnrows, std::logic_error,
1216 "In the part " << ip );
1218 constexpr local_ordinal_type connection_length = 2;
1220 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1222 (
true, std::logic_error,
1223 "The part " << ip <<
" is too short to use " << n_subparts_per_part <<
" sub parts.");
1225 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1226 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1228 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1230 for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1231 const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1232 const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1233 if (local_sub_ip != n_subparts_per_part-1) {
1234 if (local_sub_ip != 0) {
1235 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1238 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1240 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1241 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1242 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1244 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1245 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1247 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1248 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1249 printf(
"Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1253 if (local_sub_ip != 0) {
1254 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1257 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1259 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1261 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1263 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1264 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1270 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1271 std::cout <<
"partptr_sub = " << std::endl;
1272 for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1273 for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1274 std::cout << partptr_sub(i,j) <<
" ";
1276 std::cout << std::endl;
1278 std::cout <<
"partptr_sub end" << std::endl;
1282 local_ordinal_type npacks = ceil(
float(nparts)/vector_length);
1284 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1285 for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1286 part2packrowidx0_sub(ip, 0) = 0;
1288 for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1290 local_ordinal_type ip_min = ipack*vector_length;
1291 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1292 for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1293 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1297 for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1298 local_ordinal_type ip_min = ipack*vector_length;
1299 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1301 const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1303 constexpr local_ordinal_type connection_length = 2;
1305 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1306 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1308 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1309 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1310 if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1312 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1314 for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1315 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1320 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1322 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1324 IFPACK2_BLOCKHELPER_TIMER(
"compute part indices", indices);
1325 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1326 const auto* part = &partitions[p[ip]];
1327 const local_ordinal_type ipnrows = part->size();
1328 TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1330 BlockHelperDetails::get_msg_prefix(comm)
1331 <<
"partition " << p[ip]
1332 <<
" is empty, which is not allowed.");
1334 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1337 if (ip % vector_length == 0) pack_nrows = ipnrows;
1338 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1339 const local_ordinal_type offset = partptr(ip);
1340 for (local_ordinal_type i=0;i<ipnrows;++i) {
1341 const auto lcl_row = (*part)[i];
1343 BlockHelperDetails::get_msg_prefix(comm)
1344 <<
"partitions[" << p[ip] <<
"]["
1345 << i <<
"] = " << lcl_row
1346 <<
" but input matrix implies limits of [0, " << A_n_lclrows-1
1348 lclrow(offset+i) = lcl_row;
1349 rowidx2part(offset+i) = ip;
1350 if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1351 interf.row_contiguous =
false;
1353 partptr(ip+1) = offset + ipnrows;
1355 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1356 printf(
"Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1357 printf(
"partptr(%d+1) = %d\n", ip, partptr(ip+1));
1361 part2rowidx0_sub(0) = 0;
1362 partptr_sub(0, 0) = 0;
1365 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1366 const auto* part = &partitions[p[ip]];
1367 const local_ordinal_type ipnrows = part->size();
1368 const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1371 (full_line_length != ipnrows, std::logic_error,
1372 "In the part " << ip );
1374 constexpr local_ordinal_type connection_length = 2;
1376 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1378 (
true, std::logic_error,
1379 "The part " << ip <<
" is too short to use " << n_subparts_per_part <<
" sub parts.");
1381 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1382 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1384 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1386 for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1387 const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1388 const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1389 if (local_sub_ip != n_subparts_per_part-1) {
1390 if (local_sub_ip != 0) {
1391 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1394 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1396 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1397 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1398 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1400 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1401 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1403 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1404 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1405 printf(
"Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1409 if (local_sub_ip != 0) {
1410 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1413 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1415 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1417 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1419 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1420 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1427 local_ordinal_type npacks = ceil(
float(nparts)/vector_length);
1429 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1430 for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1431 part2packrowidx0_sub(ip, 0) = 0;
1433 for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1435 local_ordinal_type ip_min = ipack*vector_length;
1436 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1437 for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1438 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1442 for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1443 local_ordinal_type ip_min = ipack*vector_length;
1444 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1446 const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1448 constexpr local_ordinal_type connection_length = 2;
1450 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1451 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1453 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1454 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1455 if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1457 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1459 for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1460 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1465 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1467 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1469 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1472 if (lclrow(0) != 0) interf.row_contiguous =
false;
1474 Kokkos::deep_copy(interf.partptr, partptr);
1475 Kokkos::deep_copy(interf.lclrow, lclrow);
1477 Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1480 interf.part2rowidx0 = interf.partptr;
1481 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1483 interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1484 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1487 IFPACK2_BLOCKHELPER_TIMER(
"Fill packptr", packptr0);
1488 local_ordinal_type npacks = ceil(
float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1490 for (local_ordinal_type ip=1;ip<=nparts;++ip)
1491 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1495 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1497 for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1498 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1501 Kokkos::deep_copy(interf.packptr, packptr);
1503 local_ordinal_type npacks_per_subpart = ceil(
float(nparts)/vector_length);
1504 npacks = ceil(
float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1506 interf.packindices_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"packindices_sub"), npacks_per_subpart*n_subparts_per_part);
1507 interf.packindices_schur = local_ordinal_type_2d_view(
do_not_initialize_tag(
"packindices_schur"), npacks_per_subpart,n_subparts_per_part-1);
1509 const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1510 const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1514 for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part-1;++local_sub_ip) {
1515 for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1516 packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1517 packindices_schur(local_pack_ip,local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1521 for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1522 packindices_sub((n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip;
1525 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1526 std::cout <<
"packindices_sub = " << std::endl;
1527 for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1528 std::cout << packindices_sub(i) <<
" ";
1530 std::cout << std::endl;
1531 std::cout <<
"packindices_sub end" << std::endl;
1533 std::cout <<
"packindices_schur = " << std::endl;
1534 for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1535 for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1536 std::cout << packindices_schur(i,j) <<
" ";
1538 std::cout << std::endl;
1541 std::cout <<
"packindices_schur end" << std::endl;
1544 Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1545 Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1548 const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1550 for (local_ordinal_type k=0;k<npacks + 1;++k)
1551 packptr_sub(k) = packptr(k%npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1553 Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1554 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1556 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1564 template <
typename MatrixType>
1567 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1569 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1570 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
1571 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
1577 size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1580 local_ordinal_type_1d_view A_colindsub;
1583 vector_type_3d_view values;
1586 vector_type_3d_view values_schur;
1588 vector_type_4d_view e_values;
1590 bool is_diagonal_only;
1596 template <
typename idx_type>
1597 static KOKKOS_FORCEINLINE_FUNCTION
1598 idx_type IndexToRow (
const idx_type& ind) {
return (ind + 1) / 3; }
1601 template <
typename idx_type>
1602 static KOKKOS_FORCEINLINE_FUNCTION
1603 idx_type RowToIndex (
const idx_type& row) {
return row > 0 ? 3*row - 1 : 0; }
1605 template <
typename idx_type>
1606 static KOKKOS_FORCEINLINE_FUNCTION
1607 idx_type NumBlocks (
const idx_type& nrows) {
return nrows > 0 ? 3*nrows - 2 : 0; }
1609 template <
typename idx_type>
1610 static KOKKOS_FORCEINLINE_FUNCTION
1611 idx_type NumBlocksSchur (
const idx_type& nrows) {
return nrows > 0 ? 3*nrows + 2 : 0; }
1618 template<
typename MatrixType>
1621 IFPACK2_BLOCKHELPER_TIMER(
"createBlockTridiags", createBlockTridiags0);
1623 using execution_space =
typename impl_type::execution_space;
1624 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1625 using size_type =
typename impl_type::size_type;
1626 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1628 constexpr
int vector_length = impl_type::vector_length;
1632 const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1635 btdm.flat_td_ptr = size_type_2d_view(
do_not_initialize_tag(
"btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1636 const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part );
1637 Kokkos::parallel_scan
1638 (
"createBlockTridiags::RangePolicy::flat_td_ptr",
1639 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
1640 const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part);
1641 const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1644 btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1646 if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) {
1647 const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0);
1648 if (local_subpartidx % 2 == 0)
1649 update += btdm.NumBlocks(nrows);
1651 update += btdm.NumBlocksSchur(nrows);
1655 const auto nblocks = Kokkos::create_mirror_view_and_copy
1656 (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1));
1657 btdm.is_diagonal_only = (
static_cast<local_ordinal_type
>(nblocks()) == ntridiags);
1661 if (vector_length == 1) {
1662 btdm.pack_td_ptr = btdm.flat_td_ptr;
1666 local_ordinal_type npacks_per_subpart = 0;
1667 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1668 Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1669 for (local_ordinal_type ip=1;ip<=interf.nparts;++ip)
1670 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1671 ++npacks_per_subpart;
1673 btdm.pack_td_ptr = size_type_2d_view(
do_not_initialize_tag(
"btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1674 const Kokkos::RangePolicy<execution_space> policy(0,npacks_per_subpart);
1676 Kokkos::parallel_for
1677 (
"createBlockTridiags::RangePolicy::pack_td_ptr",
1678 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i) {
1679 for (local_ordinal_type j = 0; j < 2*interf.n_subparts_per_part; ++j) {
1680 const local_ordinal_type pack_id = ( j == 2*interf.n_subparts_per_part-1 ) ? i+(j-1)*npacks_per_subpart : i+j*npacks_per_subpart;
1681 const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id+1) - interf.packptr_sub(pack_id);
1683 const local_ordinal_type parti = interf.packptr_sub(pack_id);
1684 const local_ordinal_type partidx = parti%interf.nparts;
1686 for (local_ordinal_type pti=0;pti<nparts_in_pack;++pti) {
1687 btdm.pack_td_ptr(partidx+pti, j) = btdm.flat_td_ptr(i, j);
1693 btdm.pack_td_ptr_schur = size_type_2d_view(
do_not_initialize_tag(
"btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1695 const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1696 constexpr local_ordinal_type connection_length = 2;
1698 host_pack_td_ptr_schur(0,0) = 0;
1699 for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1700 if (i % vector_length == 0) {
1702 host_pack_td_ptr_schur(i,0) = host_pack_td_ptr_schur(i-1,host_pack_td_ptr_schur.extent(1)-1);
1703 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part-1; ++j) {
1704 host_pack_td_ptr_schur(i,j+1) = host_pack_td_ptr_schur(i,j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part-2 ? 1 : 0);
1708 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1709 host_pack_td_ptr_schur(i,j) = host_pack_td_ptr_schur(i-1,j);
1714 Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1716 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1717 const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1718 std::cout <<
"flat_td_ptr = " << std::endl;
1719 for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1720 for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1721 std::cout << host_flat_td_ptr(i,j) <<
" ";
1723 std::cout << std::endl;
1725 std::cout <<
"flat_td_ptr end" << std::endl;
1727 const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1729 std::cout <<
"pack_td_ptr = " << std::endl;
1730 for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1731 for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1732 std::cout << host_pack_td_ptr(i,j) <<
" ";
1734 std::cout << std::endl;
1736 std::cout <<
"pack_td_ptr end" << std::endl;
1739 std::cout <<
"pack_td_ptr_schur = " << std::endl;
1740 for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1741 for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1742 std::cout << host_pack_td_ptr_schur(i,j) <<
" ";
1744 std::cout << std::endl;
1746 std::cout <<
"pack_td_ptr_schur end" << std::endl;
1750 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1765 template<
typename MatrixType>
1767 setTridiagsToIdentity
1768 (
const BlockTridiags<MatrixType>& btdm,
1769 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1772 using execution_space =
typename impl_type::execution_space;
1773 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1774 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1776 const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1777 const local_ordinal_type blocksize = btdm.values.extent(1);
1780 const int vector_length = impl_type::vector_length;
1781 const int internal_vector_length = impl_type::internal_vector_length;
1783 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
1784 using internal_vector_type =
typename impl_type::internal_vector_type;
1785 using internal_vector_type_4d_view =
1786 typename impl_type::internal_vector_type_4d_view;
1788 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1789 const internal_vector_type_4d_view values
1790 (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1791 btdm.values.extent(0),
1792 btdm.values.extent(1),
1793 btdm.values.extent(2),
1794 vector_length/internal_vector_length);
1795 const local_ordinal_type vector_loop_size = values.extent(3);
1796 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1797 local_ordinal_type total_team_size(0);
1798 if (blocksize <= 5) total_team_size = 32;
1799 else if (blocksize <= 9) total_team_size = 64;
1800 else if (blocksize <= 12) total_team_size = 96;
1801 else if (blocksize <= 16) total_team_size = 128;
1802 else if (blocksize <= 20) total_team_size = 160;
1803 else total_team_size = 160;
1804 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1805 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1806 #elif defined(KOKKOS_ENABLE_HIP)
1811 local_ordinal_type total_team_size(0);
1812 if (blocksize <= 5) total_team_size = 32;
1813 else if (blocksize <= 9) total_team_size = 64;
1814 else if (blocksize <= 12) total_team_size = 96;
1815 else if (blocksize <= 16) total_team_size = 128;
1816 else if (blocksize <= 20) total_team_size = 160;
1817 else total_team_size = 160;
1818 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1819 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1820 #elif defined(KOKKOS_ENABLE_SYCL)
1822 local_ordinal_type total_team_size(0);
1823 if (blocksize <= 5) total_team_size = 32;
1824 else if (blocksize <= 9) total_team_size = 64;
1825 else if (blocksize <= 12) total_team_size = 96;
1826 else if (blocksize <= 16) total_team_size = 128;
1827 else if (blocksize <= 20) total_team_size = 160;
1828 else total_team_size = 160;
1829 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1830 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1833 const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1835 Kokkos::parallel_for
1836 (
"setTridiagsToIdentity::TeamPolicy",
1837 policy, KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
1838 const local_ordinal_type k = member.league_rank();
1839 const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0);
1840 const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1);
1842 const local_ordinal_type diff = iend - ibeg;
1843 const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1844 const btdm_scalar_type one(1);
1845 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
1846 Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](
const local_ordinal_type &ii) {
1847 const local_ordinal_type i = ibeg + ii*3;
1848 for (local_ordinal_type j=0;j<blocksize;++j) {
1849 values(i,j,j,v) = one;
1860 template<
typename MatrixType>
1863 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1864 const BlockHelperDetails::PartInterface<MatrixType> &interf,
1867 const bool overlap_communication_and_computation) {
1868 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::SymbolicPhase", SymbolicPhase);
1873 using host_execution_space =
typename impl_type::host_execution_space;
1875 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1876 using global_ordinal_type =
typename impl_type::global_ordinal_type;
1877 using size_type =
typename impl_type::size_type;
1878 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1879 using size_type_1d_view =
typename impl_type::size_type_1d_view;
1880 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
1881 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
1882 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
1883 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
1885 constexpr
int vector_length = impl_type::vector_length;
1887 const auto comm = A->getRowMap()->getComm();
1889 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A);
1890 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A);
1892 bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
1894 const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows()/g->getLocalNumRows();
1897 const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1898 const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1899 const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1900 const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1901 const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1903 const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1905 Kokkos::View<local_ordinal_type*,host_execution_space> col2row(
"col2row", A->getLocalNumCols());
1911 const auto rowmap = g->getRowMap();
1912 const auto colmap = g->getColMap();
1913 const auto dommap = g->getDomainMap();
1914 TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1916 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1917 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1918 Kokkos::parallel_for
1919 (
"performSymbolicPhase::RangePolicy::col2row",
1920 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
1921 const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1923 if (dommap->isNodeGlobalElement(gid)) {
1924 const local_ordinal_type lc = colmap->getLocalElement(gid);
1925 # if defined(BLOCKTRIDICONTAINER_DEBUG)
1927 BlockHelperDetails::get_msg_prefix(comm) <<
"GID " << gid
1928 <<
" gives an invalid local column.");
1938 const auto local_graph = g->getLocalGraphHost();
1939 const auto local_graph_rowptr = local_graph.row_map;
1940 TEUCHOS_ASSERT(local_graph_rowptr.size() ==
static_cast<size_t>(nrows + 1));
1941 const auto local_graph_colidx = local_graph.entries;
1945 Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx(
"lclrow2idx", nrows);
1947 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1948 Kokkos::parallel_for
1949 (
"performSymbolicPhase::RangePolicy::lclrow2idx",
1950 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i) {
1951 lclrow2idx[lclrow(i)] = i;
1957 typename sum_reducer_type::value_type sum_reducer_value;
1959 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1960 Kokkos::parallel_reduce
1963 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr,
typename sum_reducer_type::value_type &update) {
1965 const local_ordinal_type ri0 = lclrow2idx[lr];
1966 const local_ordinal_type pi0 = rowidx2part(ri0);
1967 for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1968 const local_ordinal_type lc = local_graph_colidx(j);
1969 const local_ordinal_type lc2r = col2row[lc];
1970 bool incr_R =
false;
1972 if (lc2r == (local_ordinal_type) -1) {
1976 const local_ordinal_type ri = lclrow2idx[lc2r];
1977 const local_ordinal_type pi = rowidx2part(ri);
1985 if (ri0 + 1 >= ri && ri0 <= ri + 1)
1991 if (lc < nrows) ++update.v[1];
1995 }, sum_reducer_type(sum_reducer_value));
1997 size_type D_nnz = sum_reducer_value.v[0];
1998 size_type R_nnz_owned = sum_reducer_value.v[1];
1999 size_type R_nnz_remote = sum_reducer_value.v[2];
2001 if (!overlap_communication_and_computation) {
2002 R_nnz_owned += R_nnz_remote;
2008 const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2010 btdm.A_colindsub = local_ordinal_type_1d_view(
"btdm.A_colindsub", D_nnz);
2011 const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2013 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2017 const local_ordinal_type nparts = partptr.extent(0) - 1;
2020 const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2021 Kokkos::parallel_for
2022 (
"performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2023 policy, KOKKOS_LAMBDA(
const local_ordinal_type &pi0) {
2024 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2025 local_ordinal_type offset = 0;
2026 for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
2027 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2029 const local_ordinal_type lr0 = lclrow(ri0);
2030 const size_type j0 = local_graph_rowptr(lr0);
2031 for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
2032 const local_ordinal_type lc = local_graph_colidx(j);
2033 const local_ordinal_type lc2r = col2row[lc];
2034 if (lc2r == (local_ordinal_type) -1)
continue;
2035 const local_ordinal_type ri = lclrow2idx[lc2r];
2036 const local_ordinal_type pi = rowidx2part(ri);
2037 if (pi != pi0)
continue;
2038 if (ri + 1 < ri0 || ri > ri0 + 1)
continue;
2039 const local_ordinal_type row_entry = j - j0;
2040 D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry;
2045 #if defined(BLOCKTRIDICONTAINER_DEBUG)
2046 for (
size_t i=0;i<D_A_colindsub.extent(0);++i)
2049 Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2053 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1);
2054 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2055 btdm.values = vector_type_3d_view(
"btdm.values", num_packed_blocks(), blocksize, blocksize);
2057 if (interf.n_subparts_per_part > 1) {
2058 const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1);
2059 const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2060 btdm.values_schur = vector_type_3d_view(
"btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2063 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2069 amd.rowptr = size_type_1d_view(
"amd.rowptr", nrows + 1);
2070 amd.A_colindsub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"amd.A_colindsub"), R_nnz_owned);
2072 const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2073 const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2075 amd.rowptr_remote = size_type_1d_view(
"amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2076 amd.A_colindsub_remote = local_ordinal_type_1d_view(
do_not_initialize_tag(
"amd.A_colindsub_remote"), R_nnz_remote);
2078 const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2079 const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2082 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
2083 Kokkos::parallel_for
2084 (
"performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2085 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
2086 const local_ordinal_type ri0 = lclrow2idx[lr];
2087 const local_ordinal_type pi0 = rowidx2part(ri0);
2088 const size_type j0 = local_graph_rowptr(lr);
2089 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2090 const local_ordinal_type lc = local_graph_colidx(j);
2091 const local_ordinal_type lc2r = col2row[lc];
2092 if (lc2r != (local_ordinal_type) -1) {
2093 const local_ordinal_type ri = lclrow2idx[lc2r];
2094 const local_ordinal_type pi = rowidx2part(ri);
2095 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2100 if (!overlap_communication_and_computation || lc < nrows) {
2103 ++R_rowptr_remote(lr);
2112 Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
2113 Kokkos::parallel_scan
2114 (
"performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2115 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr,
2116 update_type &update,
2117 const bool &
final) {
2119 val.v[0] = R_rowptr(lr);
2120 if (overlap_communication_and_computation)
2121 val.v[1] = R_rowptr_remote(lr);
2124 R_rowptr(lr) = update.v[0];
2125 if (overlap_communication_and_computation)
2126 R_rowptr_remote(lr) = update.v[1];
2129 const local_ordinal_type ri0 = lclrow2idx[lr];
2130 const local_ordinal_type pi0 = rowidx2part(ri0);
2132 size_type cnt_rowptr = R_rowptr(lr);
2133 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0;
2135 const size_type j0 = local_graph_rowptr(lr);
2136 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2137 const local_ordinal_type lc = local_graph_colidx(j);
2138 const local_ordinal_type lc2r = col2row[lc];
2139 if (lc2r != (local_ordinal_type) -1) {
2140 const local_ordinal_type ri = lclrow2idx[lc2r];
2141 const local_ordinal_type pi = rowidx2part(ri);
2142 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2145 const local_ordinal_type row_entry = j - j0;
2146 if (!overlap_communication_and_computation || lc < nrows)
2147 R_A_colindsub(cnt_rowptr++) = row_entry;
2149 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2157 Kokkos::deep_copy(amd.rowptr, R_rowptr);
2158 Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2159 if (overlap_communication_and_computation) {
2161 Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2162 Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2166 if (hasBlockCrsMatrix)
2167 amd.tpetra_values = (
const_cast<block_crs_matrix_type*
>(A_bcrs.get())->getValuesDeviceNonConst());
2169 amd.tpetra_values = (
const_cast<crs_matrix_type*
>(A_crs.get()))->getLocalValuesDevice (Tpetra::Access::ReadWrite);
2175 if (interf.n_subparts_per_part > 1)
2176 btdm.e_values = vector_type_4d_view(
"btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2178 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2185 template<
typename ArgActiveExecutionMemorySpace>
2190 typedef KB::Mode::Serial mode_type;
2191 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2192 typedef KB::Algo::Level3::CompactMKL algo_type;
2194 typedef KB::Algo::Level3::Blocked algo_type;
2196 static int recommended_team_size(
const int ,
2204 #if defined(KOKKOS_ENABLE_CUDA)
2205 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(
const int blksize,
2206 const int vector_length,
2207 const int internal_vector_length) {
2208 const int vector_size = vector_length/internal_vector_length;
2209 int total_team_size(0);
2210 if (blksize <= 5) total_team_size = 32;
2211 else if (blksize <= 9) total_team_size = 32;
2212 else if (blksize <= 12) total_team_size = 96;
2213 else if (blksize <= 16) total_team_size = 128;
2214 else if (blksize <= 20) total_team_size = 160;
2215 else total_team_size = 160;
2216 return 2*total_team_size/vector_size;
2219 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2220 typedef KB::Mode::Team mode_type;
2221 typedef KB::Algo::Level3::Unblocked algo_type;
2222 static int recommended_team_size(
const int blksize,
2223 const int vector_length,
2224 const int internal_vector_length) {
2225 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2229 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2230 typedef KB::Mode::Team mode_type;
2231 typedef KB::Algo::Level3::Unblocked algo_type;
2232 static int recommended_team_size(
const int blksize,
2233 const int vector_length,
2234 const int internal_vector_length) {
2235 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2240 #if defined(KOKKOS_ENABLE_HIP)
2241 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(
const int blksize,
2242 const int vector_length,
2243 const int internal_vector_length) {
2244 const int vector_size = vector_length/internal_vector_length;
2245 int total_team_size(0);
2246 if (blksize <= 5) total_team_size = 32;
2247 else if (blksize <= 9) total_team_size = 32;
2248 else if (blksize <= 12) total_team_size = 96;
2249 else if (blksize <= 16) total_team_size = 128;
2250 else if (blksize <= 20) total_team_size = 160;
2251 else total_team_size = 160;
2252 return 2*total_team_size/vector_size;
2255 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2256 typedef KB::Mode::Team mode_type;
2257 typedef KB::Algo::Level3::Unblocked algo_type;
2258 static int recommended_team_size(
const int blksize,
2259 const int vector_length,
2260 const int internal_vector_length) {
2261 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2265 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2266 typedef KB::Mode::Team mode_type;
2267 typedef KB::Algo::Level3::Unblocked algo_type;
2268 static int recommended_team_size(
const int blksize,
2269 const int vector_length,
2270 const int internal_vector_length) {
2271 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2276 #if defined(KOKKOS_ENABLE_SYCL)
2277 static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(
const int blksize,
2278 const int vector_length,
2279 const int internal_vector_length) {
2280 const int vector_size = vector_length/internal_vector_length;
2281 int total_team_size(0);
2282 if (blksize <= 5) total_team_size = 32;
2283 else if (blksize <= 9) total_team_size = 32;
2284 else if (blksize <= 12) total_team_size = 96;
2285 else if (blksize <= 16) total_team_size = 128;
2286 else if (blksize <= 20) total_team_size = 160;
2287 else total_team_size = 160;
2288 return 2*total_team_size/vector_size;
2291 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2292 typedef KB::Mode::Team mode_type;
2293 typedef KB::Algo::Level3::Unblocked algo_type;
2294 static int recommended_team_size(
const int blksize,
2295 const int vector_length,
2296 const int internal_vector_length) {
2297 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2301 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2302 typedef KB::Mode::Team mode_type;
2303 typedef KB::Algo::Level3::Unblocked algo_type;
2304 static int recommended_team_size(
const int blksize,
2305 const int vector_length,
2306 const int internal_vector_length) {
2307 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2312 template<
typename impl_type,
typename WWViewType>
2313 KOKKOS_INLINE_FUNCTION
2315 solveMultiVector(
const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2316 const typename impl_type::local_ordinal_type &,
2317 const typename impl_type::local_ordinal_type &i0,
2318 const typename impl_type::local_ordinal_type &r0,
2319 const typename impl_type::local_ordinal_type &nrows,
2320 const typename impl_type::local_ordinal_type &v,
2321 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2322 const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2323 const WWViewType &WW,
2324 const bool skip_first_pass=
false) {
2325 using execution_space =
typename impl_type::execution_space;
2326 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2327 using member_type =
typename team_policy_type::member_type;
2328 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2330 typedef SolveTridiagsDefaultModeAndAlgo
2331 <
typename execution_space::memory_space> default_mode_and_algo_type;
2333 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2334 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2336 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2339 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2340 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2343 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2344 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2347 local_ordinal_type i = i0, r = r0;
2352 if (skip_first_pass) {
2355 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2356 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2357 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2358 KB::Trsm<member_type,
2359 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2360 default_mode_type,default_algo_type>
2361 ::invoke(member, one, A, X2);
2362 X1.assign_data( X2.data() );
2366 KB::Trsm<member_type,
2367 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2368 default_mode_type,default_algo_type>
2369 ::invoke(member, one, A, X1);
2370 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2371 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2372 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2373 member.team_barrier();
2374 KB::Gemm<member_type,
2375 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2376 default_mode_type,default_algo_type>
2377 ::invoke(member, -one, A, X1, one, X2);
2378 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2379 KB::Trsm<member_type,
2380 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2381 default_mode_type,default_algo_type>
2382 ::invoke(member, one, A, X2);
2383 X1.assign_data( X2.data() );
2388 KB::Trsm<member_type,
2389 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2390 default_mode_type,default_algo_type>
2391 ::invoke(member, one, A, X1);
2392 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2394 A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2395 X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2396 member.team_barrier();
2397 KB::Gemm<member_type,
2398 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2399 default_mode_type,default_algo_type>
2400 ::invoke(member, -one, A, X1, one, X2);
2402 A.assign_data( &D_internal_vector_values(i,0,0,v) );
2403 KB::Trsm<member_type,
2404 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2405 default_mode_type,default_algo_type>
2406 ::invoke(member, one, A, X2);
2407 X1.assign_data( X2.data() );
2411 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2412 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2413 ::invoke(member, X1, W);
2414 member.team_barrier();
2415 KB::Gemm<member_type,
2416 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2417 default_mode_type,default_algo_type>
2418 ::invoke(member, one, A, W, zero, X1);
2423 template<
typename impl_type,
typename WWViewType,
typename XViewType>
2424 KOKKOS_INLINE_FUNCTION
2426 solveSingleVectorNew(
const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2427 const typename impl_type::local_ordinal_type &blocksize,
2428 const typename impl_type::local_ordinal_type &i0,
2429 const typename impl_type::local_ordinal_type &r0,
2430 const typename impl_type::local_ordinal_type &nrows,
2431 const typename impl_type::local_ordinal_type &v,
2432 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2433 const XViewType &X_internal_vector_values,
2434 const WWViewType &WW) {
2435 using execution_space =
typename impl_type::execution_space;
2438 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2440 typedef SolveTridiagsDefaultModeAndAlgo
2441 <
typename execution_space::memory_space> default_mode_and_algo_type;
2443 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2444 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2446 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2449 auto A = D_internal_vector_values.data();
2450 auto X = X_internal_vector_values.data();
2453 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2454 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2458 const local_ordinal_type astep = D_internal_vector_values.stride_0();
2459 const local_ordinal_type as0 = D_internal_vector_values.stride_1();
2460 const local_ordinal_type as1 = D_internal_vector_values.stride_2();
2461 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2462 const local_ordinal_type xs0 = X_internal_vector_values.stride_1();
2471 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2472 (default_mode_type,default_algo_type,
2475 blocksize,blocksize,
2480 for (local_ordinal_type tr=1;tr<nrows;++tr) {
2481 member.team_barrier();
2482 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2483 (default_mode_type,default_algo_type,
2485 blocksize, blocksize,
2487 A+2*astep, as0, as1,
2491 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2492 (default_mode_type,default_algo_type,
2495 blocksize,blocksize,
2497 A+3*astep, as0, as1,
2505 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2506 (default_mode_type,default_algo_type,
2509 blocksize, blocksize,
2514 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2516 member.team_barrier();
2517 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2518 (default_mode_type,default_algo_type,
2520 blocksize, blocksize,
2522 A+1*astep, as0, as1,
2526 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2527 (default_mode_type,default_algo_type,
2530 blocksize, blocksize,
2539 const local_ordinal_type ws0 = WW.stride_0();
2540 auto W = WW.data() + v;
2541 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2543 member, blocksize, X, xs0, W, ws0);
2544 member.team_barrier();
2545 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2546 (default_mode_type,default_algo_type,
2548 blocksize, blocksize,
2557 template<
typename local_ordinal_type,
typename ViewType>
2558 void writeBTDValuesToFile (
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2559 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2560 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2561 std::ofstream myfile;
2562 myfile.open (fileName);
2564 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2565 local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2566 const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2567 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2569 const local_ordinal_type block_size = scalar_values.extent(1);
2571 const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size;
2572 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2574 const local_ordinal_type n_packs = ceil(
float(n_parts)/n_parts_per_pack);
2576 myfile <<
"%%MatrixMarket matrix coordinate real general"<< std::endl;
2577 myfile <<
"%%nnz = " << nnz;
2578 myfile <<
" block size = " << block_size;
2579 myfile <<
" number of blocks = " << n_blocks;
2580 myfile <<
" number of parts = " << n_parts;
2581 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2582 myfile <<
" number of rows = " << n_rows ;
2583 myfile <<
" number of cols = " << n_rows;
2584 myfile <<
" number of packs = " << n_packs << std::endl;
2586 myfile << n_rows <<
" " << n_rows <<
" " << nnz << std::setprecision(9) << std::endl;
2588 local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2589 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2590 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2591 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2592 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2593 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2594 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2596 if (i_block_in_part % 3 == 0) {
2597 current_row_offset = i_block_in_part/3 * block_size;
2598 current_col_offset = i_block_in_part/3 * block_size;
2600 else if (i_block_in_part % 3 == 1) {
2601 current_row_offset = (i_block_in_part-1)/3 * block_size;
2602 current_col_offset = ((i_block_in_part-1)/3+1) * block_size;
2604 else if (i_block_in_part % 3 == 2) {
2605 current_row_offset = ((i_block_in_part-2)/3+1) * block_size;
2606 current_col_offset = (i_block_in_part-2)/3 * block_size;
2608 current_row_offset += current_part_idx * n_rows_per_part;
2609 current_col_offset += current_part_idx * n_rows_per_part;
2610 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2611 for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2612 current_row = current_row_offset + i_in_block + 1;
2613 current_col = current_col_offset + j_in_block + 1;
2614 myfile << current_row <<
" " << current_col <<
" " << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2625 template<
typename local_ordinal_type,
typename ViewType>
2626 void write4DMultiVectorValuesToFile (
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2627 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2628 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2629 std::ofstream myfile;
2630 myfile.open (fileName);
2632 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2633 const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2634 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2636 const local_ordinal_type block_size = scalar_values.extent(1);
2637 const local_ordinal_type n_cols = scalar_values.extent(2);
2639 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2640 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2642 const local_ordinal_type n_packs = ceil(
float(n_parts)/n_parts_per_pack);
2645 myfile <<
"%%MatrixMarket matrix array real general"<< std::endl;
2646 myfile <<
"%%block size = " << block_size;
2647 myfile <<
" number of blocks = " << n_blocks;
2648 myfile <<
" number of parts = " << n_parts;
2649 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2650 myfile <<
" number of rows = " << n_rows ;
2651 myfile <<
" number of cols = " << n_cols;
2652 myfile <<
" number of packs = " << n_packs << std::endl;
2654 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2656 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2657 (void) current_row_offset;
2658 (void) current_part_idx;
2659 for (local_ordinal_type j_in_block=0;j_in_block<n_cols;++j_in_block) {
2660 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2661 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2662 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2663 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2664 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2666 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2668 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2669 myfile << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2679 template<
typename local_ordinal_type,
typename ViewType>
2680 void write5DMultiVectorValuesToFile (
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2681 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2682 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2683 std::ofstream myfile;
2684 myfile.open (fileName);
2686 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2687 const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack;
2688 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2690 const local_ordinal_type block_size = scalar_values.extent(2);
2691 const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2692 const local_ordinal_type n_cols = n_blocks_cols * block_size;
2694 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2695 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2697 const local_ordinal_type n_packs = ceil(
float(n_parts)/n_parts_per_pack);
2699 myfile <<
"%%MatrixMarket matrix array real general"<< std::endl;
2700 myfile <<
"%%block size = " << block_size;
2701 myfile <<
" number of blocks = " << n_blocks;
2702 myfile <<
" number of parts = " << n_parts;
2703 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2704 myfile <<
" number of rows = " << n_rows ;
2705 myfile <<
" number of cols = " << n_cols;
2706 myfile <<
" number of packs = " << n_packs << std::endl;
2708 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2710 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2711 (void) current_row_offset;
2712 (void) current_part_idx;
2713 for (local_ordinal_type i_block_col=0;i_block_col<n_blocks_cols;++i_block_col) {
2714 for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2715 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2716 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2717 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2718 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2719 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2721 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(1))
2723 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2724 myfile << scalar_values(i_block_col,current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2735 template<
typename local_ordinal_type,
typename member_type,
typename ViewType1,
typename ViewType2>
2736 KOKKOS_INLINE_FUNCTION
2738 copy3DView(
const member_type &member,
const ViewType1 &view1,
const ViewType2 &view2) {
2751 Kokkos::Experimental::local_deep_copy(member, view1, view2);
2753 template<
typename MatrixType>
2754 struct ExtractAndFactorizeTridiags {
2756 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2758 using execution_space =
typename impl_type::execution_space;
2759 using memory_space =
typename impl_type::memory_space;
2761 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2764 using magnitude_type =
typename impl_type::magnitude_type;
2766 using row_matrix_type =
typename impl_type::tpetra_row_matrix_type;
2767 using crs_graph_type =
typename impl_type::tpetra_crs_graph_type;
2769 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
2770 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
2771 using size_type_2d_view =
typename impl_type::size_type_2d_view;
2772 using impl_scalar_type_1d_view_tpetra =
typename impl_type::impl_scalar_type_1d_view_tpetra;
2774 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
2775 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2776 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
2777 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
2778 using internal_vector_type_4d_view =
typename impl_type::internal_vector_type_4d_view;
2779 using internal_vector_type_5d_view =
typename impl_type::internal_vector_type_5d_view;
2780 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
2781 using btdm_scalar_type_5d_view =
typename impl_type::btdm_scalar_type_5d_view;
2782 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2783 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2785 using internal_vector_type =
typename impl_type::internal_vector_type;
2786 static constexpr
int vector_length = impl_type::vector_length;
2787 static constexpr
int internal_vector_length = impl_type::internal_vector_length;
2790 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2791 using member_type =
typename team_policy_type::member_type;
2795 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2796 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2797 const local_ordinal_type max_partsz;
2799 using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
2800 ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2801 ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2802 ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2804 const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2805 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2806 const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2807 const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2808 const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2809 const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2811 const local_ordinal_type blocksize, blocksize_square;
2813 const magnitude_type tiny;
2814 const local_ordinal_type vector_loop_size;
2815 const local_ordinal_type vector_length_value;
2817 bool hasBlockCrsMatrix;
2820 ExtractAndFactorizeTridiags(
const BlockTridiags<MatrixType> &btdm_,
2821 const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2824 const magnitude_type& tiny_) :
2826 partptr(interf_.partptr),
2827 lclrow(interf_.lclrow),
2828 packptr(interf_.packptr),
2829 packindices_sub(interf_.packindices_sub),
2830 packptr_sub(interf_.packptr_sub),
2831 partptr_sub(interf_.partptr_sub),
2832 part2packrowidx0_sub(interf_.part2packrowidx0_sub),
2833 packindices_schur(interf_.packindices_schur),
2834 max_partsz(interf_.max_partsz),
2836 pack_td_ptr(btdm_.pack_td_ptr),
2837 flat_td_ptr(btdm_.flat_td_ptr),
2838 pack_td_ptr_schur(btdm_.pack_td_ptr_schur),
2839 A_colindsub(btdm_.A_colindsub),
2840 internal_vector_values((internal_vector_type*)btdm_.values.data(),
2841 btdm_.values.extent(0),
2842 btdm_.values.extent(1),
2843 btdm_.values.extent(2),
2844 vector_length/internal_vector_length),
2845 internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(),
2846 btdm_.values_schur.extent(0),
2847 btdm_.values_schur.extent(1),
2848 btdm_.values_schur.extent(2),
2849 vector_length/internal_vector_length),
2850 e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(),
2851 btdm_.e_values.extent(0),
2852 btdm_.e_values.extent(1),
2853 btdm_.e_values.extent(2),
2854 btdm_.e_values.extent(3),
2855 vector_length/internal_vector_length),
2856 scalar_values((btdm_scalar_type*)btdm_.values.data(),
2857 btdm_.values.extent(0),
2858 btdm_.values.extent(1),
2859 btdm_.values.extent(2),
2861 scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(),
2862 btdm_.values_schur.extent(0),
2863 btdm_.values_schur.extent(1),
2864 btdm_.values_schur.extent(2),
2866 e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(),
2867 btdm_.e_values.extent(0),
2868 btdm_.e_values.extent(1),
2869 btdm_.e_values.extent(2),
2870 btdm_.e_values.extent(3),
2872 blocksize(btdm_.values.extent(1)),
2873 blocksize_square(blocksize*blocksize),
2876 vector_loop_size(vector_length/internal_vector_length),
2877 vector_length_value(vector_length) {
2878 using crs_matrix_type =
typename impl_type::tpetra_crs_matrix_type;
2879 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
2881 auto A_crs = Teuchos::rcp_dynamic_cast<
const crs_matrix_type>(A_);
2882 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const block_crs_matrix_type>(A_);
2884 hasBlockCrsMatrix = ! A_bcrs.is_null ();
2886 A_block_rowptr = G_->getLocalGraphDevice().row_map;
2887 if (hasBlockCrsMatrix) {
2888 A_values =
const_cast<block_crs_matrix_type*
>(A_bcrs.get())->getValuesDeviceNonConst();
2891 A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
2892 A_values = A_crs->getLocalValuesDevice (Tpetra::Access::ReadOnly);
2898 KOKKOS_INLINE_FUNCTION
2900 extract(local_ordinal_type partidx,
2901 local_ordinal_type local_subpartidx,
2902 local_ordinal_type npacks)
const {
2903 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2904 printf(
"extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
2906 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2907 const size_type kps = pack_td_ptr(partidx, local_subpartidx);
2908 local_ordinal_type kfs[vector_length] = {};
2909 local_ordinal_type ri0[vector_length] = {};
2910 local_ordinal_type nrows[vector_length] = {};
2912 for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2913 kfs[vi] = flat_td_ptr(partidx,local_subpartidx);
2914 ri0[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,0);
2915 nrows[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,1) - ri0[vi];
2916 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2917 printf(
"kfs[%d] = %d;\n", vi, kfs[vi]);
2918 printf(
"ri0[%d] = %d;\n", vi, ri0[vi]);
2919 printf(
"nrows[%d] = %d;\n", vi, nrows[vi]);
2922 local_ordinal_type tr_min = 0;
2923 local_ordinal_type tr_max = nrows[0];
2924 if (local_subpartidx % 2 == 1) {
2928 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2929 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2931 for (local_ordinal_type tr=tr_min,j=0;tr<tr_max;++tr) {
2932 for (local_ordinal_type e=0;e<3;++e) {
2933 if (hasBlockCrsMatrix) {
2934 const impl_scalar_type* block[vector_length] = {};
2935 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2936 const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2938 block[vi] = &A_values(Aj*blocksize_square);
2940 const size_type pi = kps + j;
2941 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2942 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
2945 for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2946 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2947 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2948 auto& v = internal_vector_values(pi, ii, jj, 0);
2949 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2950 v[vi] =
static_cast<btdm_scalar_type
>(block[vi][idx]);
2956 const size_type pi = kps + j;
2958 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2959 const size_type Aj_c = A_colindsub(kfs[vi] + j);
2961 for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2962 auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr)*blocksize + ii);
2964 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2965 scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c*blocksize + jj);
2971 if (nrows[0] == 1)
break;
2972 if (local_subpartidx % 2 == 0) {
2973 if (e == 1 && (tr == 0 || tr+1 == nrows[0]))
break;
2974 for (local_ordinal_type vi=1;vi<npacks;++vi) {
2975 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
2982 if (e == 0 && (tr == -1 || tr == nrows[0]))
break;
2983 for (local_ordinal_type vi=1;vi<npacks;++vi) {
2984 if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
2994 KOKKOS_INLINE_FUNCTION
2996 extract(
const member_type &member,
2997 const local_ordinal_type &partidxbeg,
2998 local_ordinal_type local_subpartidx,
2999 const local_ordinal_type &npacks,
3000 const local_ordinal_type &vbeg)
const {
3001 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3002 printf(
"extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3004 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3005 local_ordinal_type kfs_vals[internal_vector_length] = {};
3006 local_ordinal_type ri0_vals[internal_vector_length] = {};
3007 local_ordinal_type nrows_vals[internal_vector_length] = {};
3009 const size_type kps = pack_td_ptr(partidxbeg,local_subpartidx);
3010 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3011 kfs_vals[vi] = flat_td_ptr(partidxbeg+vi,local_subpartidx);
3012 ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,0);
3013 nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,1) - ri0_vals[vi];
3014 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3015 printf(
"kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3016 printf(
"ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3017 printf(
"nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3021 local_ordinal_type j_vals[internal_vector_length] = {};
3023 local_ordinal_type tr_min = 0;
3024 local_ordinal_type tr_max = nrows_vals[0];
3025 if (local_subpartidx % 2 == 1) {
3029 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3030 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3032 for (local_ordinal_type tr=tr_min;tr<tr_max;++tr) {
3033 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3034 const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
3035 if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows+1)) {
3036 auto &j = j_vals[vi];
3037 const local_ordinal_type kfs = kfs_vals[vi];
3038 const local_ordinal_type ri0 = ri0_vals[vi];
3039 local_ordinal_type lbeg, lend;
3040 if (local_subpartidx % 2 == 0) {
3041 lbeg = (tr == tr_min ? 1 : 0);
3042 lend = (tr == nrows - 1 ? 2 : 3);
3051 else if (tr == nrows) {
3056 if (hasBlockCrsMatrix) {
3057 for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3058 const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3059 const impl_scalar_type* block = &A_values(Aj*blocksize_square);
3060 const size_type pi = kps + j;
3061 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3062 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3064 Kokkos::parallel_for
3065 (Kokkos::TeamThreadRange(member,blocksize),
3066 [&](
const local_ordinal_type &ii) {
3067 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3068 scalar_values(pi, ii, jj, v) =
static_cast<btdm_scalar_type
>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
3074 for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3075 const size_type Aj_c = A_colindsub(kfs + j);
3076 const size_type pi = kps + j;
3077 Kokkos::parallel_for
3078 (Kokkos::TeamThreadRange(member,blocksize),
3079 [&](
const local_ordinal_type &ii) {
3080 auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr)*blocksize + ii);
3081 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3082 scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c*blocksize + jj);
3092 template<
typename AAViewType,
3093 typename WWViewType>
3094 KOKKOS_INLINE_FUNCTION
3096 factorize_subline(
const member_type &member,
3097 const local_ordinal_type &i0,
3098 const local_ordinal_type &nrows,
3099 const local_ordinal_type &v,
3100 const AAViewType &AA,
3101 const WWViewType &WW)
const {
3103 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3104 <
typename execution_space::memory_space> default_mode_and_algo_type;
3106 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3107 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3110 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3112 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3113 printf(
"i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3117 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3119 default_mode_type,KB::Algo::LU::Unblocked>
3120 ::invoke(member, A , tiny);
3125 local_ordinal_type i = i0;
3126 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
3127 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3128 printf(
"tr = %d, i = %d;\n", tr, i);
3130 B.assign_data( &AA(i+1,0,0,v) );
3131 KB::Trsm<member_type,
3132 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3133 default_mode_type,default_algo_type>
3134 ::invoke(member, one, A, B);
3135 C.assign_data( &AA(i+2,0,0,v) );
3136 KB::Trsm<member_type,
3137 KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3138 default_mode_type,default_algo_type>
3139 ::invoke(member, one, A, C);
3140 A.assign_data( &AA(i+3,0,0,v) );
3142 member.team_barrier();
3143 KB::Gemm<member_type,
3144 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3145 default_mode_type,default_algo_type>
3146 ::invoke(member, -one, C, B, one, A);
3148 default_mode_type,KB::Algo::LU::Unblocked>
3149 ::invoke(member, A, tiny);
3153 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3154 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
3155 ::invoke(member, A, W);
3156 KB::SetIdentity<member_type,default_mode_type>
3157 ::invoke(member, A);
3158 member.team_barrier();
3159 KB::Trsm<member_type,
3160 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3161 default_mode_type,default_algo_type>
3162 ::invoke(member, one, W, A);
3163 KB::Trsm<member_type,
3164 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3165 default_mode_type,default_algo_type>
3166 ::invoke(member, one, W, A);
3172 struct ExtractAndFactorizeSubLineTag {};
3173 struct ExtractBCDTag {};
3174 struct ComputeETag {};
3175 struct ComputeSchurTag {};
3176 struct FactorizeSchurTag {};
3178 KOKKOS_INLINE_FUNCTION
3180 operator() (
const ExtractAndFactorizeSubLineTag &,
const member_type &member)
const {
3182 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3184 const local_ordinal_type subpartidx = packptr_sub(packidx);
3185 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3186 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3187 const local_ordinal_type partidx = subpartidx%n_parts;
3189 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3190 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3191 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3193 internal_vector_scratch_type_3d_view
3194 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
3196 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3197 printf(
"rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3198 printf(
"vector_loop_size = %d\n", vector_loop_size);
3201 if (vector_loop_size == 1) {
3202 extract(partidx, local_subpartidx, npacks);
3203 factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3205 Kokkos::parallel_for
3206 (Kokkos::ThreadVectorRange(member, vector_loop_size),
3207 [&](
const local_ordinal_type &v) {
3208 const local_ordinal_type vbeg = v*internal_vector_length;
3209 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3210 printf(
"i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3213 extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3216 member.team_barrier();
3217 factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3222 KOKKOS_INLINE_FUNCTION
3224 operator() (
const ExtractBCDTag &,
const member_type &member)
const {
3226 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3227 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3228 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3230 const local_ordinal_type subpartidx = packptr_sub(packidx);
3231 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3232 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3233 const local_ordinal_type partidx = subpartidx%n_parts;
3235 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3239 if (vector_loop_size == 1) {
3240 extract(partidx, local_subpartidx, npacks);
3243 Kokkos::parallel_for
3244 (Kokkos::ThreadVectorRange(member, vector_loop_size),
3245 [&](
const local_ordinal_type &v) {
3246 const local_ordinal_type vbeg = v*internal_vector_length;
3247 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3248 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3249 printf(
"i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3252 extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3256 member.team_barrier();
3258 const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3259 const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1;
3261 const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3262 const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3264 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3265 printf(
"Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3269 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3270 Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3272 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3273 Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3277 KOKKOS_INLINE_FUNCTION
3279 operator() (
const ComputeETag &,
const member_type &member)
const {
3281 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3283 const local_ordinal_type subpartidx = packptr_sub(packidx);
3284 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3285 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3286 const local_ordinal_type partidx = subpartidx%n_parts;
3288 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3289 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3290 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3291 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3292 const local_ordinal_type num_vectors = blocksize;
3296 internal_vector_scratch_type_3d_view
3297 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
3298 if (local_subpartidx == 0) {
3299 Kokkos::parallel_for
3300 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
3301 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW,
true);
3304 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
3305 Kokkos::parallel_for
3306 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
3307 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3311 Kokkos::parallel_for
3312 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
3313 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW,
true);
3314 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3319 KOKKOS_INLINE_FUNCTION
3321 operator() (
const ComputeSchurTag &,
const member_type &member)
const {
3323 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3324 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3325 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3327 const local_ordinal_type subpartidx = packptr_sub(packidx);
3328 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3329 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3330 const local_ordinal_type partidx = subpartidx%n_parts;
3333 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3337 internal_vector_scratch_type_3d_view
3338 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
3342 const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
3343 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
3344 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
3346 for (local_ordinal_type i = 0; i < 4; ++i) {
3347 copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3348 Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3351 member.team_barrier();
3353 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3355 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1;
3356 const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2;
3358 const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3359 const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3361 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3362 <
typename execution_space::memory_space> default_mode_and_algo_type;
3364 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3365 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3367 Kokkos::parallel_for
3368 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
3369 for (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) {
3370 local_ordinal_type e_r, e_c, c_kps;
3372 if ( local_subpartidx_schur == 0 ) {
3378 else if ( i == 3 ) {
3383 else if ( i == 4 ) {
3398 else if ( i == 1 ) {
3403 else if ( i == 4 ) {
3408 else if ( i == 5 ) {
3418 auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v);
3419 auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3420 auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3421 KB::Gemm<member_type,
3422 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3423 default_mode_type,default_algo_type>
3424 ::invoke(member, -one, C, E, one, S);
3429 KOKKOS_INLINE_FUNCTION
3431 operator() (
const FactorizeSchurTag &,
const member_type &member)
const {
3432 const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3434 const local_ordinal_type subpartidx = packptr_sub(packidx);
3436 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3437 const local_ordinal_type partidx = subpartidx%n_parts;
3439 const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0);
3440 const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);
3442 internal_vector_scratch_type_3d_view
3443 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
3445 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3446 printf(
"FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3449 if (vector_loop_size == 1) {
3450 factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3452 Kokkos::parallel_for
3453 (Kokkos::ThreadVectorRange(member, vector_loop_size),
3454 [&](
const local_ordinal_type &v) {
3455 factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3461 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3462 const local_ordinal_type team_size =
3463 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3464 recommended_team_size(blocksize, vector_length, internal_vector_length);
3465 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3466 shmem_size(blocksize, blocksize, vector_loop_size);
3469 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3470 printf(
"Start ExtractAndFactorizeSubLineTag\n");
3472 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3473 Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeSubLineTag>
3474 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3477 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3478 writeBTDValuesToFile(n_parts, scalar_values,
"before.mm");
3480 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3481 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3483 execution_space().fence();
3485 writeBTDValuesToFile(n_parts, scalar_values,
"after.mm");
3486 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3487 printf(
"End ExtractAndFactorizeSubLineTag\n");
3491 if (packindices_schur.extent(1) > 0)
3494 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3495 printf(
"Start ExtractBCDTag\n");
3497 Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3498 Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3500 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_before_extract.mm");
3503 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3504 Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
3505 policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3507 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3508 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3510 execution_space().fence();
3513 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3514 printf(
"End ExtractBCDTag\n");
3516 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values,
"after_extraction_of_BCD.mm");
3517 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3518 printf(
"Start ComputeETag\n");
3520 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_after_extract.mm");
3522 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3523 Kokkos::TeamPolicy<execution_space,ComputeETag>
3524 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3526 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3527 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3529 execution_space().fence();
3531 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_after_compute.mm");
3533 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3534 printf(
"End ComputeETag\n");
3539 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3540 printf(
"Start ComputeSchurTag\n");
3542 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3543 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"before_schur.mm");
3544 Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
3545 policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3547 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3548 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3550 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"after_schur.mm");
3551 execution_space().fence();
3552 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3553 printf(
"End ComputeSchurTag\n");
3558 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3559 printf(
"Start FactorizeSchurTag\n");
3561 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3562 Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
3563 policy(packindices_schur.extent(0), team_size, vector_loop_size);
3564 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3565 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3567 execution_space().fence();
3568 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"after_factor_schur.mm");
3569 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3570 printf(
"End FactorizeSchurTag\n");
3575 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3583 template<
typename MatrixType>
3586 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3587 const BlockHelperDetails::PartInterface<MatrixType> &interf,
3589 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny) {
3590 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase", NumericPhase);
3591 ExtractAndFactorizeTridiags<MatrixType>
function(btdm, interf, A, G, tiny);
3593 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3599 template<
typename MatrixType>
3603 using execution_space =
typename impl_type::execution_space;
3604 using memory_space =
typename impl_type::memory_space;
3606 using local_ordinal_type =
typename impl_type::local_ordinal_type;
3608 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
3609 using tpetra_multivector_type =
typename impl_type::tpetra_multivector_type;
3610 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
3611 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
3612 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
3613 using const_impl_scalar_type_2d_view_tpetra =
typename impl_scalar_type_2d_view_tpetra::const_type;
3614 static constexpr
int vector_length = impl_type::vector_length;
3616 using member_type =
typename Kokkos::TeamPolicy<execution_space>::member_type;
3620 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3621 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3622 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3623 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3624 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3625 const local_ordinal_type blocksize;
3626 const local_ordinal_type num_vectors;
3629 vector_type_3d_view packed_multivector;
3630 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3632 template<
typename TagType>
3633 KOKKOS_INLINE_FUNCTION
3634 void copy_multivectors(
const local_ordinal_type &j,
3635 const local_ordinal_type &vi,
3636 const local_ordinal_type &pri,
3637 const local_ordinal_type &ri0)
const {
3638 for (local_ordinal_type col=0;col<num_vectors;++col)
3639 for (local_ordinal_type i=0;i<blocksize;++i)
3640 packed_multivector(pri, i, col)[vi] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3646 const vector_type_3d_view &pmv)
3647 : partptr(interf.partptr),
3648 packptr(interf.packptr),
3649 part2packrowidx0(interf.part2packrowidx0),
3650 part2rowidx0(interf.part2rowidx0),
3651 lclrow(interf.lclrow),
3652 blocksize(pmv.extent(1)),
3653 num_vectors(pmv.extent(2)),
3654 packed_multivector(pmv) {}
3658 KOKKOS_INLINE_FUNCTION
3660 operator() (
const local_ordinal_type &packidx)
const {
3661 local_ordinal_type partidx = packptr(packidx);
3662 local_ordinal_type npacks = packptr(packidx+1) - partidx;
3663 const local_ordinal_type pri0 = part2packrowidx0(partidx);
3665 local_ordinal_type ri0[vector_length] = {};
3666 local_ordinal_type nrows[vector_length] = {};
3667 for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
3668 ri0[v] = part2rowidx0(partidx);
3669 nrows[v] = part2rowidx0(partidx+1) - ri0[v];
3671 for (local_ordinal_type j=0;j<nrows[0];++j) {
3672 local_ordinal_type cnt = 1;
3673 for (;cnt<npacks && j!= nrows[cnt];++cnt);
3675 const local_ordinal_type pri = pri0 + j;
3676 for (local_ordinal_type col=0;col<num_vectors;++col)
3677 for (local_ordinal_type i=0;i<blocksize;++i)
3678 for (local_ordinal_type v=0;v<npacks;++v)
3679 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
3683 KOKKOS_INLINE_FUNCTION
3685 operator() (
const member_type &member)
const {
3686 const local_ordinal_type packidx = member.league_rank();
3687 const local_ordinal_type partidx_begin = packptr(packidx);
3688 const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
3689 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3690 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](
const local_ordinal_type &v) {
3691 const local_ordinal_type partidx = partidx_begin + v;
3692 const local_ordinal_type ri0 = part2rowidx0(partidx);
3693 const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
3696 const local_ordinal_type pri = pri0;
3697 for (local_ordinal_type col=0;col<num_vectors;++col) {
3698 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](
const local_ordinal_type &i) {
3699 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
3703 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](
const local_ordinal_type &j) {
3704 const local_ordinal_type pri = pri0 + j;
3705 for (local_ordinal_type col=0;col<num_vectors;++col)
3706 for (local_ordinal_type i=0;i<blocksize;++i)
3707 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3713 void run(
const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3714 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3715 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3717 scalar_multivector = scalar_multivector_;
3718 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3719 const local_ordinal_type vl = vector_length;
3720 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3721 Kokkos::parallel_for
3722 (
"MultiVectorConverter::TeamPolicy", policy, *
this);
3724 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3725 Kokkos::parallel_for
3726 (
"MultiVectorConverter::RangePolicy", policy, *
this);
3728 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3729 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3738 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3739 typedef KB::Mode::Serial mode_type;
3740 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3741 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3742 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3744 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3746 static int recommended_team_size(
const int ,
3753 #if defined(KOKKOS_ENABLE_CUDA)
3754 static inline int SolveTridiagsRecommendedCudaTeamSize(
const int blksize,
3755 const int vector_length,
3756 const int internal_vector_length) {
3757 const int vector_size = vector_length/internal_vector_length;
3758 int total_team_size(0);
3759 if (blksize <= 5) total_team_size = 32;
3760 else if (blksize <= 9) total_team_size = 32;
3761 else if (blksize <= 12) total_team_size = 96;
3762 else if (blksize <= 16) total_team_size = 128;
3763 else if (blksize <= 20) total_team_size = 160;
3764 else total_team_size = 160;
3765 return total_team_size/vector_size;
3769 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3770 typedef KB::Mode::Team mode_type;
3771 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3772 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3773 static int recommended_team_size(
const int blksize,
3774 const int vector_length,
3775 const int internal_vector_length) {
3776 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3780 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3781 typedef KB::Mode::Team mode_type;
3782 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3783 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3784 static int recommended_team_size(
const int blksize,
3785 const int vector_length,
3786 const int internal_vector_length) {
3787 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3792 #if defined(KOKKOS_ENABLE_HIP)
3793 static inline int SolveTridiagsRecommendedHIPTeamSize(
const int blksize,
3794 const int vector_length,
3795 const int internal_vector_length) {
3796 const int vector_size = vector_length/internal_vector_length;
3797 int total_team_size(0);
3798 if (blksize <= 5) total_team_size = 32;
3799 else if (blksize <= 9) total_team_size = 32;
3800 else if (blksize <= 12) total_team_size = 96;
3801 else if (blksize <= 16) total_team_size = 128;
3802 else if (blksize <= 20) total_team_size = 160;
3803 else total_team_size = 160;
3804 return total_team_size/vector_size;
3808 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3809 typedef KB::Mode::Team mode_type;
3810 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3811 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3812 static int recommended_team_size(
const int blksize,
3813 const int vector_length,
3814 const int internal_vector_length) {
3815 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3819 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3820 typedef KB::Mode::Team mode_type;
3821 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3822 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3823 static int recommended_team_size(
const int blksize,
3824 const int vector_length,
3825 const int internal_vector_length) {
3826 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3831 #if defined(KOKKOS_ENABLE_SYCL)
3832 static inline int SolveTridiagsRecommendedSYCLTeamSize(
const int blksize,
3833 const int vector_length,
3834 const int internal_vector_length) {
3835 const int vector_size = vector_length/internal_vector_length;
3836 int total_team_size(0);
3837 if (blksize <= 5) total_team_size = 32;
3838 else if (blksize <= 9) total_team_size = 32;
3839 else if (blksize <= 12) total_team_size = 96;
3840 else if (blksize <= 16) total_team_size = 128;
3841 else if (blksize <= 20) total_team_size = 160;
3842 else total_team_size = 160;
3843 return total_team_size/vector_size;
3847 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
3848 typedef KB::Mode::Team mode_type;
3849 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3850 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3851 static int recommended_team_size(
const int blksize,
3852 const int vector_length,
3853 const int internal_vector_length) {
3854 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3858 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
3859 typedef KB::Mode::Team mode_type;
3860 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3861 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3862 static int recommended_team_size(
const int blksize,
3863 const int vector_length,
3864 const int internal_vector_length) {
3865 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3873 template<
typename MatrixType>
3874 struct SolveTridiags {
3876 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3877 using execution_space =
typename impl_type::execution_space;
3879 using local_ordinal_type =
typename impl_type::local_ordinal_type;
3882 using magnitude_type =
typename impl_type::magnitude_type;
3883 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
3884 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
3886 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
3887 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
3888 using size_type_2d_view =
typename impl_type::size_type_2d_view;
3890 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
3891 using internal_vector_type_4d_view =
typename impl_type::internal_vector_type_4d_view;
3892 using internal_vector_type_5d_view =
typename impl_type::internal_vector_type_5d_view;
3893 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
3895 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3897 using internal_vector_type =
typename impl_type::internal_vector_type;
3898 static constexpr
int vector_length = impl_type::vector_length;
3899 static constexpr
int internal_vector_length = impl_type::internal_vector_length;
3902 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
3903 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
3906 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3907 using member_type =
typename team_policy_type::member_type;
3911 local_ordinal_type n_subparts_per_part;
3912 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3913 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3914 const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
3915 const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
3916 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3917 const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
3918 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3919 const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
3921 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
3922 const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
3925 const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
3928 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
3929 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
3930 const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
3932 internal_vector_type_4d_view X_internal_vector_values_schur;
3934 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
3935 const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
3938 const local_ordinal_type vector_loop_size;
3941 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
3942 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
3943 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3945 Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3947 const impl_scalar_type df;
3948 const bool compute_diff;
3951 SolveTridiags(
const BlockHelperDetails::PartInterface<MatrixType> &interf,
3952 const BlockTridiags<MatrixType> &btdm,
3953 const vector_type_3d_view &pmv,
3954 const impl_scalar_type damping_factor,
3955 const bool is_norm_manager_active)
3958 n_subparts_per_part(interf.n_subparts_per_part),
3959 partptr(interf.partptr),
3960 packptr(interf.packptr),
3961 packindices_sub(interf.packindices_sub),
3962 packindices_schur(interf.packindices_schur),
3963 part2packrowidx0(interf.part2packrowidx0),
3964 part2packrowidx0_sub(interf.part2packrowidx0_sub),
3965 lclrow(interf.lclrow),
3966 packptr_sub(interf.packptr_sub),
3967 partptr_sub(interf.partptr_sub),
3968 pack_td_ptr_schur(btdm.pack_td_ptr_schur),
3970 pack_td_ptr(btdm.pack_td_ptr),
3971 D_internal_vector_values((internal_vector_type*)btdm.values.data(),
3972 btdm.values.extent(0),
3973 btdm.values.extent(1),
3974 btdm.values.extent(2),
3975 vector_length/internal_vector_length),
3976 X_internal_vector_values((internal_vector_type*)pmv.data(),
3980 vector_length/internal_vector_length),
3981 X_internal_scalar_values((btdm_scalar_type*)pmv.data(),
3987 2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0),
3990 vector_length/internal_vector_length),
3991 D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(),
3992 btdm.values_schur.extent(0),
3993 btdm.values_schur.extent(1),
3994 btdm.values_schur.extent(2),
3995 vector_length/internal_vector_length),
3996 e_internal_vector_values((internal_vector_type*)btdm.e_values.data(),
3997 btdm.e_values.extent(0),
3998 btdm.e_values.extent(1),
3999 btdm.e_values.extent(2),
4000 btdm.e_values.extent(3),
4001 vector_length/internal_vector_length),
4002 vector_loop_size(vector_length/internal_vector_length),
4003 Y_scalar_multivector(),
4006 compute_diff(is_norm_manager_active)
4012 KOKKOS_INLINE_FUNCTION
4014 copyToFlatMultiVector(
const member_type &member,
4015 const local_ordinal_type partidxbeg,
4016 const local_ordinal_type npacks,
4017 const local_ordinal_type pri0,
4018 const local_ordinal_type v,
4019 const local_ordinal_type blocksize,
4020 const local_ordinal_type num_vectors)
const {
4021 const local_ordinal_type vbeg = v*internal_vector_length;
4022 if (vbeg < npacks) {
4023 local_ordinal_type ri0_vals[internal_vector_length] = {};
4024 local_ordinal_type nrows_vals[internal_vector_length] = {};
4025 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4026 const local_ordinal_type partidx = partidxbeg+vv;
4027 ri0_vals[vi] = partptr(partidx);
4028 nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
4031 impl_scalar_type z_partial_sum(0);
4032 if (nrows_vals[0] == 1) {
4033 const local_ordinal_type j=0, pri=pri0;
4035 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4036 const local_ordinal_type ri0 = ri0_vals[vi];
4037 const local_ordinal_type nrows = nrows_vals[vi];
4039 Kokkos::parallel_for
4040 (Kokkos::TeamThreadRange(member, blocksize),
4041 [&](
const local_ordinal_type &i) {
4042 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4043 for (local_ordinal_type col=0;col<num_vectors;++col) {
4044 impl_scalar_type &y = Y_scalar_multivector(row,col);
4045 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4049 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4050 z_partial_sum += yd_abs*yd_abs;
4058 Kokkos::parallel_for
4059 (Kokkos::TeamThreadRange(member, nrows_vals[0]),
4060 [&](
const local_ordinal_type &j) {
4061 const local_ordinal_type pri = pri0 + j;
4062 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4063 const local_ordinal_type ri0 = ri0_vals[vi];
4064 const local_ordinal_type nrows = nrows_vals[vi];
4066 for (local_ordinal_type col=0;col<num_vectors;++col) {
4067 for (local_ordinal_type i=0;i<blocksize;++i) {
4068 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4069 impl_scalar_type &y = Y_scalar_multivector(row,col);
4070 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4074 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4075 z_partial_sum += yd_abs*yd_abs;
4084 Z_scalar_vector(member.league_rank()) += z_partial_sum;
4091 template<
typename WWViewType>
4092 KOKKOS_INLINE_FUNCTION
4094 solveSingleVector(
const member_type &member,
4095 const local_ordinal_type &blocksize,
4096 const local_ordinal_type &i0,
4097 const local_ordinal_type &r0,
4098 const local_ordinal_type &nrows,
4099 const local_ordinal_type &v,
4100 const WWViewType &WW)
const {
4102 typedef SolveTridiagsDefaultModeAndAlgo
4103 <
typename execution_space::memory_space> default_mode_and_algo_type;
4105 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4106 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4109 auto A = D_internal_vector_values.data();
4110 auto X = X_internal_vector_values.data();
4113 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4114 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4118 const local_ordinal_type astep = D_internal_vector_values.stride_0();
4119 const local_ordinal_type as0 = D_internal_vector_values.stride_1();
4120 const local_ordinal_type as1 = D_internal_vector_values.stride_2();
4121 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
4122 const local_ordinal_type xs0 = X_internal_vector_values.stride_1();
4131 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4132 (default_mode_type,default_algo_type,
4135 blocksize,blocksize,
4140 for (local_ordinal_type tr=1;tr<nrows;++tr) {
4141 member.team_barrier();
4142 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4143 (default_mode_type,default_algo_type,
4145 blocksize, blocksize,
4147 A+2*astep, as0, as1,
4151 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4152 (default_mode_type,default_algo_type,
4155 blocksize,blocksize,
4157 A+3*astep, as0, as1,
4165 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4166 (default_mode_type,default_algo_type,
4169 blocksize, blocksize,
4174 for (local_ordinal_type tr=nrows;tr>1;--tr) {
4176 member.team_barrier();
4177 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4178 (default_mode_type,default_algo_type,
4180 blocksize, blocksize,
4182 A+1*astep, as0, as1,
4186 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4187 (default_mode_type,default_algo_type,
4190 blocksize, blocksize,
4199 const local_ordinal_type ws0 = WW.stride_0();
4200 auto W = WW.data() + v;
4201 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
4203 member, blocksize, X, xs0, W, ws0);
4204 member.team_barrier();
4205 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4206 (default_mode_type,default_algo_type,
4208 blocksize, blocksize,
4217 template<
typename WWViewType>
4218 KOKKOS_INLINE_FUNCTION
4220 solveMultiVector(
const member_type &member,
4221 const local_ordinal_type &,
4222 const local_ordinal_type &i0,
4223 const local_ordinal_type &r0,
4224 const local_ordinal_type &nrows,
4225 const local_ordinal_type &v,
4226 const WWViewType &WW)
const {
4228 typedef SolveTridiagsDefaultModeAndAlgo
4229 <
typename execution_space::memory_space> default_mode_and_algo_type;
4231 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4232 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4235 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4236 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4239 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4240 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4243 local_ordinal_type i = i0, r = r0;
4248 KB::Trsm<member_type,
4249 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4250 default_mode_type,default_algo_type>
4251 ::invoke(member, one, A, X1);
4252 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
4253 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
4254 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
4255 member.team_barrier();
4256 KB::Gemm<member_type,
4257 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4258 default_mode_type,default_algo_type>
4259 ::invoke(member, -one, A, X1, one, X2);
4260 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
4261 KB::Trsm<member_type,
4262 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4263 default_mode_type,default_algo_type>
4264 ::invoke(member, one, A, X2);
4265 X1.assign_data( X2.data() );
4269 KB::Trsm<member_type,
4270 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4271 default_mode_type,default_algo_type>
4272 ::invoke(member, one, A, X1);
4273 for (local_ordinal_type tr=nrows;tr>1;--tr) {
4275 A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
4276 X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
4277 member.team_barrier();
4278 KB::Gemm<member_type,
4279 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4280 default_mode_type,default_algo_type>
4281 ::invoke(member, -one, A, X1, one, X2);
4283 A.assign_data( &D_internal_vector_values(i,0,0,v) );
4284 KB::Trsm<member_type,
4285 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4286 default_mode_type,default_algo_type>
4287 ::invoke(member, one, A, X2);
4288 X1.assign_data( X2.data() );
4292 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4293 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
4294 ::invoke(member, X1, W);
4295 member.team_barrier();
4296 KB::Gemm<member_type,
4297 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4298 default_mode_type,default_algo_type>
4299 ::invoke(member, one, A, W, zero, X1);
4303 template<
int B>
struct SingleVectorTag {};
4304 template<
int B>
struct MultiVectorTag {};
4306 template<
int B>
struct SingleVectorSubLineTag {};
4307 template<
int B>
struct MultiVectorSubLineTag {};
4308 template<
int B>
struct SingleVectorApplyCTag {};
4309 template<
int B>
struct MultiVectorApplyCTag {};
4310 template<
int B>
struct SingleVectorSchurTag {};
4311 template<
int B>
struct MultiVectorSchurTag {};
4312 template<
int B>
struct SingleVectorApplyETag {};
4313 template<
int B>
struct MultiVectorApplyETag {};
4314 template<
int B>
struct SingleVectorCopyToFlatTag {};
4315 template<
int B>
struct SingleZeroingTag {};
4318 KOKKOS_INLINE_FUNCTION
4320 operator() (
const SingleVectorTag<B> &,
const member_type &member)
const {
4321 const local_ordinal_type packidx = member.league_rank();
4322 const local_ordinal_type partidx = packptr(packidx);
4323 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4324 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4325 const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4326 const local_ordinal_type r0 = part2packrowidx0(partidx);
4327 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4328 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4329 const local_ordinal_type num_vectors = 1;
4330 internal_vector_scratch_type_3d_view
4331 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4332 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4333 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4335 Kokkos::parallel_for
4336 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4337 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4338 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4343 KOKKOS_INLINE_FUNCTION
4345 operator() (
const MultiVectorTag<B> &,
const member_type &member)
const {
4346 const local_ordinal_type packidx = member.league_rank();
4347 const local_ordinal_type partidx = packptr(packidx);
4348 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4349 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4350 const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4351 const local_ordinal_type r0 = part2packrowidx0(partidx);
4352 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4353 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4354 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4356 internal_vector_scratch_type_3d_view
4357 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4358 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4359 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4361 Kokkos::parallel_for
4362 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4363 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4364 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4369 KOKKOS_INLINE_FUNCTION
4371 operator() (
const SingleVectorSubLineTag<B> &,
const member_type &member)
const {
4373 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4375 const local_ordinal_type subpartidx = packptr_sub(packidx);
4376 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4377 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4378 const local_ordinal_type partidx = subpartidx%n_parts;
4380 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4381 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4382 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4383 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4384 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4390 internal_vector_scratch_type_3d_view
4391 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4393 Kokkos::parallel_for
4394 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4395 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4400 KOKKOS_INLINE_FUNCTION
4402 operator() (
const SingleVectorApplyCTag<B> &,
const member_type &member)
const {
4405 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4407 const local_ordinal_type subpartidx = packptr_sub(packidx);
4408 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4409 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4410 const local_ordinal_type partidx = subpartidx%n_parts;
4411 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4414 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4415 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4416 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4418 internal_vector_scratch_type_3d_view
4419 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4423 const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
4424 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
4425 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
4430 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4432 const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0;
4433 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1;
4435 typedef SolveTridiagsDefaultModeAndAlgo
4436 <
typename execution_space::memory_space> default_mode_and_algo_type;
4438 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4439 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4441 if (local_subpartidx == 0) {
4442 Kokkos::parallel_for
4443 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4444 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4445 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4446 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4448 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4449 (default_mode_type,default_algo_type,
4451 blocksize, blocksize,
4453 C.data(), C.stride_0(), C.stride_1(),
4454 v_1.data(), v_1.stride_0(),
4456 v_2.data(), v_2.stride_0());
4459 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4460 Kokkos::parallel_for
4461 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4462 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4463 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4464 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4466 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4467 (default_mode_type,default_algo_type,
4469 blocksize, blocksize,
4471 C.data(), C.stride_0(), C.stride_1(),
4472 v_1.data(), v_1.stride_0(),
4474 v_2.data(), v_2.stride_0());
4478 Kokkos::parallel_for
4479 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4481 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4482 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4483 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4485 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4486 (default_mode_type,default_algo_type,
4488 blocksize, blocksize,
4490 C.data(), C.stride_0(), C.stride_1(),
4491 v_1.data(), v_1.stride_0(),
4493 v_2.data(), v_2.stride_0());
4496 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4497 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4498 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4500 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4501 (default_mode_type,default_algo_type,
4503 blocksize, blocksize,
4505 C.data(), C.stride_0(), C.stride_1(),
4506 v_1.data(), v_1.stride_0(),
4508 v_2.data(), v_2.stride_0());
4515 KOKKOS_INLINE_FUNCTION
4517 operator() (
const SingleVectorSchurTag<B> &,
const member_type &member)
const {
4518 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4520 const local_ordinal_type partidx = packptr_sub(packidx);
4522 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4524 const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0);
4525 const local_ordinal_type nrows = 2*(n_subparts_per_part-1);
4527 const local_ordinal_type r0_schur = nrows * member.league_rank();
4529 internal_vector_scratch_type_3d_view
4530 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4532 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4533 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4534 for (local_ordinal_type i = 0; i < 2; ++i) {
4535 copy3DView<local_ordinal_type>(member,
4536 Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4537 Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4541 Kokkos::parallel_for
4542 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4543 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4546 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4547 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4548 for (local_ordinal_type i = 0; i < 2; ++i) {
4549 copy3DView<local_ordinal_type>(member,
4550 Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4551 Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4557 KOKKOS_INLINE_FUNCTION
4559 operator() (
const SingleVectorApplyETag<B> &,
const member_type &member)
const {
4560 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4562 const local_ordinal_type subpartidx = packptr_sub(packidx);
4563 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4564 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4565 const local_ordinal_type partidx = subpartidx%n_parts;
4566 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4568 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4569 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4571 internal_vector_scratch_type_3d_view
4572 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4576 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4578 typedef SolveTridiagsDefaultModeAndAlgo
4579 <
typename execution_space::memory_space> default_mode_and_algo_type;
4581 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4582 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4584 if (local_subpartidx == 0) {
4585 Kokkos::parallel_for
4586 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4588 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4590 for (local_ordinal_type row = 0; row < nrows; ++row) {
4591 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4592 auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4594 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4595 (default_mode_type,default_algo_type,
4597 blocksize, blocksize,
4599 E.data(), E.stride_0(), E.stride_1(),
4600 v_2.data(), v_2.stride_0(),
4602 v_1.data(), v_1.stride_0());
4606 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4607 Kokkos::parallel_for
4608 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4609 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4611 for (local_ordinal_type row = 0; row < nrows; ++row) {
4612 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4613 auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4615 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4616 (default_mode_type,default_algo_type,
4618 blocksize, blocksize,
4620 E.data(), E.stride_0(), E.stride_1(),
4621 v_2.data(), v_2.stride_0(),
4623 v_1.data(), v_1.stride_0());
4628 Kokkos::parallel_for
4629 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4631 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4633 for (local_ordinal_type row = 0; row < nrows; ++row) {
4634 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4635 auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4637 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4638 (default_mode_type,default_algo_type,
4640 blocksize, blocksize,
4642 E.data(), E.stride_0(), E.stride_1(),
4643 v_2.data(), v_2.stride_0(),
4645 v_1.data(), v_1.stride_0());
4649 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4651 for (local_ordinal_type row = 0; row < nrows; ++row) {
4652 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4653 auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4655 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4656 (default_mode_type,default_algo_type,
4658 blocksize, blocksize,
4660 E.data(), E.stride_0(), E.stride_1(),
4661 v_2.data(), v_2.stride_0(),
4663 v_1.data(), v_1.stride_0());
4671 KOKKOS_INLINE_FUNCTION
4673 operator() (
const SingleVectorCopyToFlatTag<B> &,
const member_type &member)
const {
4674 const local_ordinal_type packidx = member.league_rank();
4675 const local_ordinal_type partidx = packptr(packidx);
4676 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4677 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4678 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4679 const local_ordinal_type num_vectors = 1;
4681 Kokkos::parallel_for
4682 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4683 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4688 KOKKOS_INLINE_FUNCTION
4690 operator() (
const SingleZeroingTag<B> &,
const member_type &member)
const {
4691 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4692 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4696 void run(
const impl_scalar_type_2d_view_tpetra &Y,
4697 const impl_scalar_type_1d_view &Z) {
4698 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4699 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::SolveTridiags", SolveTridiags);
4702 this->Y_scalar_multivector = Y;
4703 this->Z_scalar_vector = Z;
4705 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4706 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4708 const local_ordinal_type team_size =
4709 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4710 recommended_team_size(blocksize, vector_length, internal_vector_length);
4711 const int per_team_scratch = internal_vector_scratch_type_3d_view
4712 ::shmem_size(blocksize, num_vectors, vector_loop_size);
4714 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4715 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4716 if (num_vectors == 1) { \
4717 const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4718 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4719 Kokkos::parallel_for \
4720 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4721 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4723 const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4724 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4725 Kokkos::parallel_for \
4726 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4727 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4730 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4731 if (num_vectors == 1) { \
4732 if (packindices_schur.extent(1) <= 0) { \
4733 Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4734 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4735 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4736 Kokkos::parallel_for \
4737 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4743 Kokkos::TeamPolicy<execution_space,SingleZeroingTag<B> > \
4744 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4745 Kokkos::parallel_for \
4746 ("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4750 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4751 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4752 Kokkos::TeamPolicy<execution_space,SingleVectorSubLineTag<B> > \
4753 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4754 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4755 Kokkos::parallel_for \
4756 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4758 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4759 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4762 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4763 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4764 Kokkos::TeamPolicy<execution_space,SingleVectorApplyCTag<B> > \
4765 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4766 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4767 Kokkos::parallel_for \
4768 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4770 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4771 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4774 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4775 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4776 Kokkos::TeamPolicy<execution_space,SingleVectorSchurTag<B> > \
4777 policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4778 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4779 Kokkos::parallel_for \
4780 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4782 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4783 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4786 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4787 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4788 Kokkos::TeamPolicy<execution_space,SingleVectorApplyETag<B> > \
4789 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4790 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4791 Kokkos::parallel_for \
4792 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4794 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4795 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4799 Kokkos::TeamPolicy<execution_space,SingleVectorCopyToFlatTag<B> > \
4800 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4801 Kokkos::parallel_for \
4802 ("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4807 Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4808 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4809 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4810 Kokkos::parallel_for \
4811 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4815 switch (blocksize) {
4816 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
4817 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
4818 case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 6);
4819 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
4820 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4821 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4822 case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4823 case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4824 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4825 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4826 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4827 case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4828 default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
4830 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
4832 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4833 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4840 template<
typename MatrixType>
4843 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
4844 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
4845 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
4846 const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
4847 const bool overlap_communication_and_computation,
4849 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
4850 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
4851 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z,
4852 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
4854 const BlockHelperDetails::PartInterface<MatrixType> &interf,
4857 typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work,
4862 const int max_num_sweeps,
4863 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
4864 const int check_tol_every) {
4865 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
4868 using node_memory_space =
typename impl_type::node_memory_space;
4869 using local_ordinal_type =
typename impl_type::local_ordinal_type;
4870 using size_type =
typename impl_type::size_type;
4871 using impl_scalar_type =
typename impl_type::impl_scalar_type;
4872 using magnitude_type =
typename impl_type::magnitude_type;
4873 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
4874 using vector_type_1d_view =
typename impl_type::vector_type_1d_view;
4875 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
4876 using tpetra_multivector_type =
typename impl_type::tpetra_multivector_type;
4878 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
4882 "Neither Tpetra importer nor Async importer is null.");
4885 "Maximum number of sweeps must be >= 1.");
4888 const bool is_seq_method_requested = !tpetra_importer.is_null();
4889 const bool is_async_importer_active = !async_importer.is_null();
4890 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4891 const magnitude_type tolerance = tol*tol;
4892 const local_ordinal_type blocksize = btdm.values.extent(1);
4893 const local_ordinal_type num_vectors = Y.getNumVectors();
4894 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4896 const impl_scalar_type zero(0.0);
4898 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4899 "The seq method for applyInverseJacobi, " <<
4900 "which in any case is for developer use only, " <<
4901 "does not support norm-based termination.");
4902 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4903 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4905 std::invalid_argument,
4906 "The seq method for applyInverseJacobi, " <<
4907 "which in any case is for developer use only, " <<
4908 "only supports memory spaces accessible from host.");
4911 const size_type work_span_required = num_blockrows*num_vectors*blocksize;
4912 if (work.span() < work_span_required)
4913 work = vector_type_1d_view(
"vector workspace 1d view", work_span_required);
4916 const local_ordinal_type W_size = interf.packptr.extent(0)-1;
4917 if (local_ordinal_type(W.extent(0)) < W_size)
4918 W = impl_scalar_type_1d_view(
"W", W_size);
4920 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4922 if (is_seq_method_requested) {
4923 if (Z.getNumVectors() != Y.getNumVectors())
4924 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors,
false);
4926 if (is_async_importer_active) {
4928 async_importer->createDataBuffer(num_vectors);
4929 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4935 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4936 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4937 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4938 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4939 if (is_y_zero) Kokkos::deep_copy(YY, zero);
4942 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4943 damping_factor, is_norm_manager_active);
4945 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4948 auto A_crs = Teuchos::rcp_dynamic_cast<
const typename impl_type::tpetra_crs_matrix_type>(A);
4949 auto A_bcrs = Teuchos::rcp_dynamic_cast<
const typename impl_type::tpetra_block_crs_matrix_type>(A);
4951 bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
4954 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());
4956 BlockHelperDetails::ComputeResidualVector<MatrixType>
4957 compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
4958 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
4961 if (is_norm_manager_active)
4962 norm_manager.setCheckFrequency(check_tol_every);
4966 for (;sweep<max_num_sweeps;++sweep) {
4970 multivector_converter.run(XX);
4972 if (is_seq_method_requested) {
4976 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
4977 compute_residual_vector.run(YY, XX, ZZ);
4980 multivector_converter.run(YY);
4984 if (overlap_communication_and_computation || !is_async_importer_active) {
4985 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
4986 compute_residual_vector.run(pmv, XX, YY, remote_multivector,
true);
4987 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
4988 if (is_async_importer_active) async_importer->cancel();
4991 if (is_async_importer_active) {
4992 async_importer->syncRecv();
4993 compute_residual_vector.run(pmv, XX, YY, remote_multivector,
false);
4996 if (is_async_importer_active)
4997 async_importer->syncExchange(YY);
4998 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance))
break;
4999 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5007 solve_tridiags.run(YY, W);
5010 if (is_norm_manager_active) {
5012 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5013 if (sweep + 1 == max_num_sweeps) {
5014 norm_manager.ireduce(sweep,
true);
5015 norm_manager.checkDone(sweep + 1, tolerance,
true);
5017 norm_manager.ireduce(sweep);
5025 if (is_norm_manager_active) norm_manager.finalize();
5031 template<
typename MatrixType>
5034 using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5035 using block_tridiags_type = BlockTridiags<MatrixType>;
5038 using async_import_type = AsyncableImport<MatrixType>;
5045 bool overlap_communication_and_computation;
5048 mutable typename impl_type::tpetra_multivector_type Z;
5049 mutable typename impl_type::impl_scalar_type_1d_view W;
5052 part_interface_type part_interface;
5053 block_tridiags_type block_tridiags;
5055 mutable typename impl_type::vector_type_1d_view work;
5056 mutable norm_manager_type norm_manager;
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:140
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
size_t size_type
Definition: Ifpack2_BlockHelper.hpp:253
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:884
int applyInverseJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Z, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::vector_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:4842
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
BlockTridiags< MatrixType > createBlockTridiags(const BlockHelperDetails::PartInterface< MatrixType > &interf)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1620
Definition: Ifpack2_BlockHelper.hpp:350
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:96
void performNumericPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tiny)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3585
void send(const Packet sendBuffer[], const Ordinal count, const int destRank, const int tag, const Comm< Ordinal > &comm)
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockHelper.hpp:262
void performSymbolicPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &g, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, BlockHelperDetails::AmD< MatrixType > &amd, const bool overlap_communication_and_computation)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1862
Definition: Ifpack2_BlockHelper.hpp:188
BlockHelperDetails::PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::Array< Teuchos::Array< typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type > > &partitions, const typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type n_subparts_per_part_in)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1043
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:2186
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockHelper.hpp:320
Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:163
RCP< CommRequest< Ordinal > > isend(const ArrayRCP< const Packet > &sendBuffer, const int destRank, const int tag, const Comm< Ordinal > &comm)
#define TEUCHOS_ASSERT(assertion_test)
Definition: Ifpack2_BlockHelper.hpp:215
Definition: Ifpack2_BlockHelper.hpp:249
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1565
Definition: Ifpack2_BlockComputeResidualVector.hpp:23
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3600