#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP

#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#include <Kokkos_ArithTraits.hpp>
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_SetIdentity_Decl.hpp>
#include <KokkosBatched_SetIdentity_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"
#include "Ifpack2_BlockComputeResidualAndSolve.hpp"
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif

#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3

#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI

#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE

#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
#endif

#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
namespace BlockTriDiContainerDetails {

namespace KB = KokkosBatched;
template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
                                          MemoryTraitsType::is_random_access |
                                          flag>;
template <typename ViewType>
using Unmanaged = Kokkos::View<typename ViewType::data_type,
                               typename ViewType::array_layout,
                               typename ViewType::device_type,
                               MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
template <typename ViewType>
using Atomic = Kokkos::View<typename ViewType::data_type,
                            typename ViewType::array_layout,
                            typename ViewType::device_type,
                            MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
template <typename ViewType>
using Const = Kokkos::View<typename ViewType::const_data_type,
                           typename ViewType::array_layout,
                           typename ViewType::device_type,
                           typename ViewType::memory_traits>;
template <typename ViewType>
using ConstUnmanaged = Const<Unmanaged<ViewType> >;

template <typename ViewType>
using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
template <typename ViewType>
using Scratch = Kokkos::View<typename ViewType::data_type,
                             typename ViewType::array_layout,
                             typename ViewType::execution_space::scratch_memory_space,
                             MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
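// Illustrative usage of the view alias helpers above (a sketch only; the view
// type, sizes, and names below are hypothetical and not part of this file):
//
//   using view_type = Kokkos::View<double**, Kokkos::DefaultExecutionSpace>;
//   view_type a("a", 100, 16);
//   Unmanaged<view_type> a_u(a.data(), a.extent(0), a.extent(1)); // same data, no reference counting
//   ConstUnmanaged<view_type> a_cu(a_u);                          // read-only, unmanaged
//   // Scratch<view_type> places the same data type in team scratch memory,
//   // e.g. Scratch<view_type> s(member.team_scratch(0), rows, cols);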
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());

#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
  { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
#else
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
#endif
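// Typical intent of the profiler region macros (illustrative sketch):
//
//   IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
//   // ... kernel launches to capture with the CUDA profiler ...
//   IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
//
// When CUDA profiling is not enabled, both macros expand to nothing, so the
// calls cost nothing in regular builds.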
template<typename MatrixType>
Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = !A_bcrs.is_null();

  // It is fine to use the graph of the A_crs matrix and a block size of 1 here.
  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap()   , blocksize)));
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::rcp(new tpetra_import_type(src, tgt));
}
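// Note (illustrative): src and tgt above are *point* maps obtained by expanding
// the block domain and column maps by 'blocksize', so the resulting Tpetra
// importer exchanges point (scalar) entries of block multivectors rather than
// whole blocks.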
template<typename MatrixType>
struct AsyncableImport {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

#if !defined(HAVE_IFPACK2_MPI)
  typedef int MPI_Request;
  typedef int MPI_Comm;
#endif

  using scalar_type = typename impl_type::scalar_type;
  static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int waitany(int count, MPI_Request* reqs, int* index) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
    return 0;
#endif
  }

  static int waitall(int count, MPI_Request* reqs) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
#else
    return 0;
#endif
  }
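  // The wrappers above reduce MPI boilerplate and compile to no-ops without
  // MPI. A minimal sketch of how they are meant to be paired (buffers, counts,
  // and ranks below are hypothetical):
  //
  //   MPI_Request rreq, sreq;
  //   irecv(comm, recv_buf, nbytes, /*src=*/1, /*tag=*/42, &rreq);  // post receive
  //   isend(comm, send_buf, nbytes, /*dest=*/1, /*tag=*/42, &sreq); // post send
  //   waitall(1, &rreq);                                            // block until received
  //
  // Passing ireq == NULL lets MPI own the request (it is freed immediately),
  // which is how "fire and forget" messages are issued.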
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;

  using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
  using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;

  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;

#if defined(KOKKOS_ENABLE_CUDA)
  using impl_scalar_type_1d_view =
    typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
                              Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
# elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
                              Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
# else // no experimental macros are defined
                              typename impl_type::impl_scalar_type_1d_view,
# endif
                              typename impl_type::impl_scalar_type_1d_view>::type;
#else
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
#endif
  using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type*,Kokkos::HostSpace>;
  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

#ifdef HAVE_IFPACK2_MPI
  MPI_Comm comm;
#endif

  impl_scalar_type_2d_view_tpetra remote_multivector;
  local_ordinal_type blocksize;

  template <typename T>
  struct SendRecvPair {
    T send;
    T recv;
  };

  SendRecvPair<int_1d_view_host> pids;                      // neighbor ranks
  SendRecvPair<std::vector<MPI_Request> > reqs;             // MPI requests (raw handles, not a Kokkos view)
  SendRecvPair<size_type_1d_view> offset;                   // per-neighbor offsets into lids/buffer
  SendRecvPair<size_type_1d_view_host> offset_host;         // host copy of the offsets
  SendRecvPair<local_ordinal_type_1d_view> lids;            // local ids to pack/unpack
  SendRecvPair<impl_scalar_type_1d_view> buffer;            // device message buffers
  SendRecvPair<impl_scalar_type_1d_view_host> buffer_host;  // host staging buffers (non GPU-aware MPI)

  local_ordinal_type_1d_view dm2cm;                         // permutation: domain map -> column map

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  using exec_instance_1d_std_vector = std::vector<execution_space>;
  exec_instance_1d_std_vector exec_instances;
#endif
  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                       const size_type_1d_view &offs) {
    // wrap lens with a host view and mirror it on device
    Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
    const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);

    // exclusive prefix sum of the message lengths
    const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
    const local_ordinal_type lens_size = lens_device.extent(0);
    Kokkos::parallel_scan
      ("AsyncableImport::RangePolicy::setOffsetValues",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
        if (final)
          offs(i) = update;
        update += (i < lens_size ? lens_device[i] : 0);
      });
  }

  void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
                           const size_type_1d_view_host &offs) {
    // wrap lens with a host view
    Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
    const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);

    // exclusive prefix sum of the message lengths, computed on host
    offs(0) = 0;
    for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
      offs(i) = offs(i-1) + lens[i-1];
    }
  }
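  // Both variants compute an exclusive prefix sum of the message lengths, e.g.
  // (hypothetical numbers) lens = {3, 1, 4}  ==>  offs = {0, 3, 4, 8}.
  // offs(i)..offs(i+1) then delimits the i-th neighbor's message in the packed
  // buffer, and the last entry is the total buffer length.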
  void createMpiRequests(const tpetra_import_type &import) {
    Tpetra::Distributor &distributor = import.getDistributor();

    // copy pids from the distributor
    const auto pids_from = distributor.getProcsFrom();
    pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
    memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));

    const auto pids_to = distributor.getProcsTo();
    pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
    memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));

    // mpi requests
    reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
    reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));

    // construct offsets
#if 0
    const auto lengths_to = distributor.getLengthsTo();
    offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValues(lengths_to, offset.send);
    offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);

    setOffsetValues(lengths_from, offset.recv);
    offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
#else
    const auto lengths_to = distributor.getLengthsTo();
    offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

    const auto lengths_from = distributor.getLengthsFrom();
    offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

    setOffsetValuesHost(lengths_to, offset_host.send);
    setOffsetValuesHost(lengths_from, offset_host.recv);
#endif
  }
  void createSendRecvIDs(const tpetra_import_type &import) {
    // For each remote PID, the list of local ids to receive.
    const auto remote_lids = import.getRemoteLIDs();
    const local_ordinal_type_1d_view_host
      remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
    lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
    Kokkos::deep_copy(lids.recv, remote_lids_view_host);

    // For each export PID, the list of local ids to send.
    auto epids = import.getExportPIDs();
    auto elids = import.getExportLIDs();
    TEUCHOS_ASSERT(epids.size() == elids.size());
    lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
    auto lids_send_host = Kokkos::create_mirror_view(lids.send);

    // naive search and sort: group the export lids by destination rank
    for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
      const auto pid_send_value = pids.send[i];
      for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
        if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
      TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
    }
    Kokkos::deep_copy(lids.send, lids_send_host);
  }
  void createExecutionSpaceInstances() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if KOKKOS_VERSION >= 40699
    exec_instances = Kokkos::Experimental::partition_space(execution_space(), std::vector<int>(8, 1));
#else
    exec_instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
#endif
#endif
  }
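  // partition_space() splits the default execution space into 8 independent
  // instances (streams/queues on CUDA, HIP, or SYCL). A rough sketch of the
  // intent, with hypothetical indices:
  //
  //   exec_instances[i % 8]      // instance used to pack / stage message i
  //   exec_instances[i].fence(); // wait only for the work queued on that instance
  //
  // so packing, host staging, and MPI traffic for different neighbors can
  // overlap instead of serializing on a single stream.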
  // tag types selecting pack (ToBuffer) vs unpack (ToMultiVector) in copy()
  struct ToBuffer {};
  struct ToMultiVector {};

  AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
                  const Teuchos::RCP<const tpetra_map_type> &tgt_map,
                  const local_ordinal_type blocksize_,
                  const local_ordinal_type_1d_view dm2cm_) {
    blocksize = blocksize_;
    dm2cm = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
    comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
#endif
    const tpetra_import_type import(src_map, tgt_map);

    createMpiRequests(import);
    createSendRecvIDs(import);
    createExecutionSpaceInstances();
  }
  void createDataBuffer(const local_ordinal_type &num_vectors) {
    const size_type extent_0 = lids.recv.extent(0)*blocksize;
    const size_type extent_1 = num_vectors;
    if (remote_multivector.extent(0) == extent_0 &&
        remote_multivector.extent(1) == extent_1) {
      // buffers are already sized for this number of vectors
      return;
    }
    remote_multivector = impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);

    const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
    const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;

    buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
    buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);

    // host staging buffers are only needed when MPI is not GPU aware
    if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
      buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
      buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
    }
  }

  void cancel() {
#ifdef HAVE_IFPACK2_MPI
    waitall(reqs.recv.size(), reqs.recv.data());
    waitall(reqs.send.size(), reqs.send.data());
#endif
  }
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
  template<typename PackTag>
  void copy(const local_ordinal_type_1d_view &lids_,
            const impl_scalar_type_1d_view &buffer_,
            const local_ordinal_type ibeg_,
            const local_ordinal_type iend_,
            const impl_scalar_type_2d_view_tpetra &multivector_,
            const local_ordinal_type blocksize_,
            const execution_space &exec_instance_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize*ibeg_;

    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    local_ordinal_type vector_size(0);
    if      (blocksize_ <=  4) vector_size =  4;
    else if (blocksize_ <=  8) vector_size =  8;
    else if (blocksize_ <= 16) vector_size = 16;
    else                       vector_size = 32;

    const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
    const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
    Kokkos::parallel_for
      (Kokkos::Experimental::require(policy, work_item_property),
       KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
        const local_ordinal_type i = member.league_rank();
        Kokkos::parallel_for
          (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
            auto aptr = abase + blocksize_*(i + idiff*j);
            auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
            if (std::is_same<PackTag,ToBuffer>::value)
              Kokkos::parallel_for
                (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                  aptr[k] = bptr[k];
                });
            else
              Kokkos::parallel_for
                (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                  bptr[k] = aptr[k];
                });
          });
      });
  }
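  // Buffer layout used by copy<PackTag>(): within one message covering rows
  // [ibeg_, iend_), the block of row lids_(i + ibeg_) for column j lives at
  //   abase + blocksize_ * (i + idiff * j),
  // i.e. columns are the slow index. A tiny hypothetical example with
  // blocksize_ = 2, idiff = 3, num_vectors = 2:
  //   buffer = [ r0c0 | r1c0 | r2c0 | r0c1 | r1c1 | r2c1 ], each chunk 2 scalars.
  // ToBuffer packs multivector -> buffer, ToMultiVector unpacks buffer -> multivector.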
  void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize*num_vectors;

    // 0. post the receives
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i], 42, &reqs.recv[i]);
      } else {
        irecv(comm,
              reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i], 42, &reqs.recv[i]);
      }
    }

    execution_space().fence();

    // 1. pack the send buffer, one execution space instance per neighbor
    for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
      if (i<8) exec_instances[i%8].fence();
      copy<ToBuffer>(lids.send, buffer.send,
                     offset_host.send(i), offset_host.send(i+1),
                     mv, blocksize,
                     exec_instances[i%8]);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = mv.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;

        Kokkos::deep_copy(exec_instances[i%8],
                          Kokkos::subview(buffer_host.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                            offset_host.send(i)*mv_blocksize,
                                            offset_host.send(i+1)*mv_blocksize)),
                          Kokkos::subview(buffer.send,
                                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                                            offset_host.send(i)*mv_blocksize,
                                            offset_host.send(i+1)*mv_blocksize)));
      }
    }

    // 2. post the sends
    for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
      if (i<8) exec_instances[i%8].fence();
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i], 42, &reqs.send[i]);
      } else {
        isend(comm,
              reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i], 42, &reqs.send[i]);
      }
    }

    // 3. poll the receives so MPI can make progress during I/O and compute overlap
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void syncRecvVar1() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // 0. wait for and unpack each receive
    for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
      local_ordinal_type idx = i;

      // 0.0. wait for any posted receive to complete
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);

      // 0.1. if MPI is not GPU aware, stage the host data into the device buffer
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;
        Kokkos::deep_copy(
          Kokkos::subview(buffer.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)),
          Kokkos::subview(buffer_host.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)));
      }

      // 0.2. unpack the buffer into the remote multivector
      copy<ToMultiVector>(lids.recv, buffer.recv,
                          offset_host.recv(idx), offset_host.recv(idx+1),
                          remote_multivector, blocksize,
                          exec_instances[idx%8]);
    }

    // 1. wait for the sends so the send buffers can be reused
    waitall(reqs.send.size(), reqs.send.data());
#endif // HAVE_IFPACK2_MPI
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
#endif // defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
  template<typename PackTag>
  void copy(const local_ordinal_type_1d_view &lids_,
            const impl_scalar_type_1d_view &buffer_,
            const local_ordinal_type &ibeg_,
            const local_ordinal_type &iend_,
            const impl_scalar_type_2d_view_tpetra &multivector_,
            const local_ordinal_type blocksize_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize*ibeg_;
    if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      local_ordinal_type vector_size(0);
      if      (blocksize_ <=  4) vector_size =  4;
      else if (blocksize_ <=  8) vector_size =  8;
      else if (blocksize_ <= 16) vector_size = 16;
      else                       vector_size = 32;
      const team_policy_type policy(idiff, 1, vector_size);
      Kokkos::parallel_for
        ("AsyncableImport::TeamPolicy::copy",
         policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
          const local_ordinal_type i = member.league_rank();
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
              auto aptr = abase + blocksize_*(i + idiff*j);
              auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
              if (std::is_same<PackTag,ToBuffer>::value)
                Kokkos::parallel_for
                  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                    aptr[k] = bptr[k];
                  });
              else
                Kokkos::parallel_for
                  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                    bptr[k] = aptr[k];
                  });
            });
        });
    } else {
      const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
      Kokkos::parallel_for
        ("AsyncableImport::RangePolicy::copy",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
          const local_ordinal_type i = ij%idiff;
          const local_ordinal_type j = ij/idiff;
          auto aptr = abase + blocksize_*(i + idiff*j);
          auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
          auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
          auto to   = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
          memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
        });
    }
  }
  void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);

#ifdef HAVE_IFPACK2_MPI
    // constants
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize*num_vectors;

    // receive async
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        irecv(comm,
              reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i], 42, &reqs.recv[i]);
      } else {
        irecv(comm,
              reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i], 42, &reqs.recv[i]);
      }
    }

    // send async
    for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
      copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
                     mv, blocksize);
      if (Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        isend(comm,
              reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i], 42, &reqs.send[i]);
      } else {
        Kokkos::deep_copy(
          Kokkos::subview(buffer_host.send,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.send(i)*mv_blocksize,
                            offset_host.send(i+1)*mv_blocksize)),
          Kokkos::subview(buffer.send,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.send(i)*mv_blocksize,
                            offset_host.send(i+1)*mv_blocksize)));
        isend(comm,
              reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i], 42, &reqs.send[i]);
      }
    }

    // poll the receives so MPI can make progress during I/O and compute overlap
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  void syncRecvVar0() {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
#ifdef HAVE_IFPACK2_MPI
    // receive async.
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      local_ordinal_type idx = i;
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
      if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
        const local_ordinal_type num_vectors = remote_multivector.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;
        Kokkos::deep_copy(
          Kokkos::subview(buffer.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)),
          Kokkos::subview(buffer_host.recv,
                          Kokkos::pair<local_ordinal_type, local_ordinal_type>(
                            offset_host.recv(idx)*mv_blocksize,
                            offset_host.recv(idx+1)*mv_blocksize)));
      }
      copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
                          remote_multivector, blocksize);
    }
    // wait on the sends to match all Isends with a cleanup operation.
    waitall(reqs.send.size(), reqs.send.data());
#endif
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
  void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    asyncSendRecvVar1(mv);
#else
    asyncSendRecvVar0(mv);
#endif
#else
    asyncSendRecvVar0(mv);
#endif
  }
  void syncRecv() {
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
    syncRecvVar1();
#else
    syncRecvVar0();
#endif
#else
    syncRecvVar0();
#endif
  }

  void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
    asyncSendRecv(mv);
    syncRecv();
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
};

template <typename ViewType1, typename ViewType2>
struct are_same_struct {
  ViewType1 keys1;
  ViewType2 keys2;

  are_same_struct(ViewType1 keys1_, ViewType2 keys2_) : keys1(keys1_), keys2(keys2_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(int i, unsigned int& count) const {
    if (keys1(i) != keys2(i)) count++;
  }
};

template <typename ViewType1, typename ViewType2>
bool are_same (ViewType1 keys1, ViewType2 keys2) {
  unsigned int are_same_ = 0;

  Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
                          are_same_struct(keys1, keys2),
                          are_same_);
  return are_same_ == 0;
}
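// are_same() is a device-side comparison helper: it reduces the number of
// mismatching entries and returns true only when the first view matches the
// leading entries of the second element-wise. Illustrative use (the view
// names below are hypothetical):
//
//   if (are_same(domain_map_gids, column_map_gids)) {
//     // owned part of the column map equals the domain map; no permutation needed
//   }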
template<typename MatrixType>
Teuchos::RCP<AsyncableImport<MatrixType> >
createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
  using global_indices_array_device_type = Kokkos::View<const global_ordinal_type*, typename tpetra_map_type::device_type>;

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = !A_bcrs.is_null();

  // It is fine to use the graph of the A_crs matrix and a block size of 1 here.
  const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());

  const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
  const auto domain_map = g.getDomainMap();
  const auto column_map = g.getColMap();

  std::vector<global_ordinal_type> gids;

  Kokkos::Subview<global_indices_array_device_type, std::pair<int,int>> column_map_global_iD_last;

  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
  {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);

    global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
    global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();

    if (are_same(domain_map_global_iD, column_map_global_iD)) {
      // the most likely path: owned ids of the column map equal the domain map
      separate_remotes = true;
      need_owned_permutation = false;

      column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
        std::pair<int,int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
    } else {
      for (size_t i=0;i<column_map->getLocalNumElements();++i) {
        const global_ordinal_type gid = column_map->getGlobalElement(i);
        if (!domain_map->isNodeGlobalElement(gid)) {
          found_first = true;
          gids.push_back(gid);
        } else if (found_first) {
          separate_remotes = false;
          break;
        }
        if (!found_first && !need_owned_permutation &&
            domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
          // The owned parts of the domain and column maps are ordered differently.
          need_owned_permutation = true;
        }
      }
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }

  if (separate_remotes) {
    IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
    const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
    const auto parsimonious_col_map
      = need_owned_permutation ?
      Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())):
      Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
    if (parsimonious_col_map->getGlobalNumElements() > 0) {
      // make the importer only if needed.
      local_ordinal_type_1d_view dm2cm;
      if (need_owned_permutation) {
        dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
        const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
        for (size_t i=0;i<domain_map->getLocalNumElements();++i)
          dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
        Kokkos::deep_copy(dm2cm, dm2cm_host);
      }
      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
      return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
    }
  }
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  return Teuchos::null;
}
template<typename local_ordinal_type>
local_ordinal_type costTRSM(const local_ordinal_type block_size) {
  return block_size*block_size;
}

template<typename local_ordinal_type>
local_ordinal_type costGEMV(const local_ordinal_type block_size) {
  return 2*block_size*block_size;
}

template<typename local_ordinal_type>
local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
  return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length-1) * costGEMV(block_size);
}
template<typename local_ordinal_type>
local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
                                  const local_ordinal_type num_teams,
                                  const local_ordinal_type line_length,
                                  const local_ordinal_type block_size,
                                  const local_ordinal_type n_subparts_per_part) {
  const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part-1) * 2) / n_subparts_per_part);
  if (subline_length < 1) {
    return INT_MAX;
  }

  const local_ordinal_type p_n_lines = ceil(double(num_parts)/num_teams);
  const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part)*num_parts/num_teams);
  const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part-1)*num_parts/num_teams);

  const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
  const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part-1)*2,block_size);
  const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length,block_size);
  const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);

  if (n_subparts_per_part == 1) {
    return p_costApplyAinv;
  }
  return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
}
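// Worked example of the cost model above with hypothetical sizes:
// block_size = 4, line_length = 20, num_parts = num_teams = 1,
// n_subparts_per_part = 2:
//   costTRSM = 16, costGEMV = 32
//   subline_length = ceil((20 - 1*2)/2) = 9
//   costTriDiagSolve(9,4) = 2*9*16 + 2*8*32 = 800   (per sub line)
//   costTriDiagSolve(2,4) = 2*2*16 + 2*1*32 = 128   (Schur system)
//   total = applyE + applyS + applyAinv + applyC
//         = 1*9*2*32 + 1*128 + 2*800 + 1*2*32 = 2368
// versus n_subparts_per_part = 1: costTriDiagSolve(20,4) = 2*20*16 + 2*19*32 = 1856,
// so with a single team the model keeps the line whole; splitting only pays
// off when there are more teams available than lines.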
template<typename local_ordinal_type>
local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
                                         const local_ordinal_type num_teams,
                                         const local_ordinal_type line_length,
                                         const local_ordinal_type block_size) {
  local_ordinal_type n_subparts_per_part_0 = 1;
  local_ordinal_type flop_0 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0);
  local_ordinal_type flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0+1);
  while (flop_0 > flop_1) {
    flop_0 = flop_1;
    flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, (++n_subparts_per_part_0)+1);
  }
  return n_subparts_per_part_0;
}
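// getAutomaticNSubparts() walks n_subparts_per_part = 1, 2, 3, ... while the
// modeled flop count keeps decreasing and stops at the first local minimum.
// Sketch with hypothetical costs: flops(1)=1856, flops(2)=1500, flops(3)=1700
//   -> the loop accepts 2, sees that 3 is worse, and returns 2.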
template<typename ArgActiveExecutionMemorySpace>
struct SolveTridiagsDefaultModeAndAlgo;
template<typename MatrixType>
BlockHelperDetails::PartInterface<MatrixType>
createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                    const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                    const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type> > &partitions,
                    const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
  using size_type = typename impl_type::size_type;

  auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);

  const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
  constexpr int vector_length = impl_type::vector_length;
  constexpr int internal_vector_length = impl_type::internal_vector_length;

  const auto comm = A->getRowMap()->getComm();

  BlockHelperDetails::PartInterface<MatrixType> interf;

  const bool jacobi = partitions.size() == 0;
  const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();

  typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
  std::vector<size_idx_pair_type> partsz(nparts);

  if (!jacobi) {
    // sort parts by size, largest first, to improve pack occupancy
    for (local_ordinal_type i=0;i<nparts;++i)
      partsz[i] = size_idx_pair_type(partitions[i].size(), i);
    std::sort(partsz.begin(), partsz.end(),
              [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
                return x.first > y.first;
              });
  }

  local_ordinal_type n_subparts_per_part;
  if (n_subparts_per_part_in == -1) {
    // If n_subparts_per_part is set to -1, the algorithm chooses the value
    // automatically from the flop-count model above.
    using execution_space = typename impl_type::execution_space;

    const int line_length = partsz[0].first;

    const local_ordinal_type team_size =
      SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
      recommended_team_size(blocksize, vector_length, internal_vector_length);

    const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));

    n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
#endif
  } else {
    n_subparts_per_part = n_subparts_per_part_in;
  }

  // Total number of sub lines:
  const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
  // Total number of sub lines plus the Schur complement systems between them:
  const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1);
#if defined(BLOCKTRIDICONTAINER_DEBUG)
  local_ordinal_type nrows = 0;
  if (jacobi)
    nrows = nparts;
  else
    for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();

  TEUCHOS_TEST_FOR_EXCEPT_MSG
    (nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
     << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
#endif

  // permutation of parts, sorted by size (identity for Jacobi)
  std::vector<local_ordinal_type> p;
  if (jacobi) {
    interf.max_partsz = 1;
    interf.max_subpartsz = 0;
    interf.n_subparts_per_part = 1;
    interf.nparts = nparts;
  } else {
    p.resize(nparts);
    for (local_ordinal_type i=0;i<nparts;++i)
      p[i] = partsz[i].second;

    interf.max_partsz = partsz[0].first;

    constexpr local_ordinal_type connection_length = 2;
    const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
    const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

    interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
    interf.n_subparts_per_part = n_subparts_per_part;
    interf.nparts = nparts;
  }
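  // Example of the sub-line split used above (hypothetical sizes): with
  // max_partsz = 20, n_subparts_per_part = 3, connection_length = 2:
  //   sub_line_length      = (20 - 2*2) / 3 = 5
  //   last_sub_line_length = 20 - 2*(2 + 5) = 6
  // i.e. the line is laid out as  [5 | 2 | 5 | 2 | 6], where the 2-row pieces
  // are the connections that become the Schur complement blocks.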
  // allocate the part interface views
  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);

  interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
  interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
  interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

  interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);

  // host mirrors, filled below and copied to device at the end
  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
  const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);

  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);

  const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
  const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
  const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);

  // Determine parts.
  interf.row_contiguous = true;
  partptr(0) = 0;
  part2rowidx0(0) = 0;
  part2packrowidx0(0) = 0;
  local_ordinal_type pack_nrows = 0;
  local_ordinal_type pack_nrows_sub = 0;
  if (jacobi) {
    IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
    // Jacobi: each row is its own part, so the part/row maps are the identity.
    for (local_ordinal_type i=0; i <= nparts; ++i) {
      part2rowidx0(i) = i;
      partptr(i) = i;
    }
    for (local_ordinal_type i=0; i < nparts; ++i) {
      rowidx2part(i) = i;
      lclrow(i) = i;
    }
    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      // assume no overlap.
      if (ip % vector_length == 0) pack_nrows = 1;
      part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
    }
    part2rowidx0_sub(0) = 0;
    partptr_sub(0, 0) = 0;

    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      constexpr local_ordinal_type ipnrows = 1;
      const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);

      TEUCHOS_TEST_FOR_EXCEPTION
        (full_line_length != ipnrows, std::logic_error,
         "In the part " << ip );

      constexpr local_ordinal_type connection_length = 2;

      if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
        TEUCHOS_TEST_FOR_EXCEPTION
          (true, std::logic_error,
           "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");

      const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
      const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

      if (ip % vector_length == 0) pack_nrows_sub = ipnrows;

      for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
        const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
        const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
        if (local_sub_ip != n_subparts_per_part-1) {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
          } else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
          }
          partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
          partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
          partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
          part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
          printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
#endif
        } else {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
          } else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
          }
          partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
#endif
        }
      }
    }

#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
    std::cout << "partptr_sub = " << std::endl;
    for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
      for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
        std::cout << partptr_sub(i,j) << " ";
      }
      std::cout << std::endl;
    }
    std::cout << "partptr_sub end" << std::endl;
#endif

    {
      local_ordinal_type npacks = ceil(float(nparts)/vector_length);

      local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
      for (local_ordinal_type ip=0;ip<ip_max;++ip) {
        part2packrowidx0_sub(ip, 0) = 0;
      }
      for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
        if (ipack != 0) {
          local_ordinal_type ip_min = ipack*vector_length;
          ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
          for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
            part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
          }
        }

        for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
          local_ordinal_type ip_min = ipack*vector_length;
          ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;

          const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);

          constexpr local_ordinal_type connection_length = 2;

          const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
          const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

          if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
          if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
          if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;

          part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;

          for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
            part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
          }
        }
      }

      Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  } else {
    IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      const auto* part = &partitions[p[ip]];
      const local_ordinal_type ipnrows = part->size();
      TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
      TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
                                  BlockHelperDetails::get_msg_prefix(comm)
                                  << "partition " << p[ip]
                                  << " is empty, which is not allowed.");
      // assume no overlap.
      part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
      // Since parts are ordered in decreasing size, the size of the first
      // part in a pack is the size for all parts in the pack.
      if (ip % vector_length == 0) pack_nrows = ipnrows;
      part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
      const local_ordinal_type offset = partptr(ip);
      for (local_ordinal_type i=0;i<ipnrows;++i) {
        const auto lcl_row = (*part)[i];
        TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
                                    BlockHelperDetails::get_msg_prefix(comm)
                                    << "partitions[" << p[ip] << "]["
                                    << i << "] = " << lcl_row
                                    << " but input matrix implies limits of [0, " << A_n_lclrows-1
                                    << "].");
        lclrow(offset+i) = lcl_row;
        rowidx2part(offset+i) = ip;
        if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
          interf.row_contiguous = false;
      }
      partptr(ip+1) = offset + ipnrows;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
      printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
      printf("partptr(%d+1) = %d\n", ip, partptr(ip+1));
#endif
    }

    part2rowidx0_sub(0) = 0;
    partptr_sub(0, 0) = 0;

    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      const auto* part = &partitions[p[ip]];
      const local_ordinal_type ipnrows = part->size();
      const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);

      TEUCHOS_TEST_FOR_EXCEPTION
        (full_line_length != ipnrows, std::logic_error,
         "In the part " << ip );

      constexpr local_ordinal_type connection_length = 2;

      if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
        TEUCHOS_TEST_FOR_EXCEPTION
          (true, std::logic_error,
           "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");

      const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
      const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

      if (ip % vector_length == 0) pack_nrows_sub = ipnrows;

      for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
        const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
        const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
        if (local_sub_ip != n_subparts_per_part-1) {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
          } else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
          }
          partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
          partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
          partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
          part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
          printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
#endif
        } else {
          if (local_sub_ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
          } else if (ip != 0) {
            partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
          }
          partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);

          part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
#endif
        }
      }
    }

    {
      local_ordinal_type npacks = ceil(float(nparts)/vector_length);

      local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
      for (local_ordinal_type ip=0;ip<ip_max;++ip) {
        part2packrowidx0_sub(ip, 0) = 0;
      }
      for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
        if (ipack != 0) {
          local_ordinal_type ip_min = ipack*vector_length;
          ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
          for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
            part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
          }
        }

        for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
          local_ordinal_type ip_min = ipack*vector_length;
          ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;

          const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);

          constexpr local_ordinal_type connection_length = 2;

          const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
          const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);

          if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
          if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
          if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;

          part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;

          for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
            part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
          }
        }
      }

      Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
  TEUCHOS_ASSERT(partptr(nparts) == nrows);
#endif
  if (lclrow(0) != 0) interf.row_contiguous = false;

  Kokkos::deep_copy(interf.partptr, partptr);
  Kokkos::deep_copy(interf.lclrow, lclrow);

  Kokkos::deep_copy(interf.partptr_sub, partptr_sub);

  // assume no overlap, thus:
  interf.part2rowidx0 = interf.partptr;
  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);

  interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);

  { // Fill packptr.
    IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
    local_ordinal_type npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
    npacks = 0;
    for (local_ordinal_type ip=1;ip<=nparts;++ip)
      if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
        ++npacks;

    interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
    const auto packptr = Kokkos::create_mirror_view(interf.packptr);
    packptr(0) = 0;
    for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
      if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
        packptr(k++) = ip;

    Kokkos::deep_copy(interf.packptr, packptr);

    local_ordinal_type npacks_per_subpart = ceil(float(nparts)/vector_length);
    npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);

    interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart*n_subparts_per_part);
    interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart,n_subparts_per_part-1);

    const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
    const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);

    // Fill packindices_sub and packindices_schur
    for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part-1;++local_sub_ip) {
      for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
        packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
        packindices_schur(local_pack_ip,local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
      }
    }

    for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
      packindices_sub((n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip;
    }

#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
    std::cout << "packindices_sub = " << std::endl;
    for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
      std::cout << packindices_sub(i) << " ";
    }
    std::cout << std::endl;
    std::cout << "packindices_sub end" << std::endl;

    std::cout << "packindices_schur = " << std::endl;
    for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
      for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
        std::cout << packindices_schur(i,j) << " ";
      }
      std::cout << std::endl;
    }

    std::cout << "packindices_schur end" << std::endl;
#endif

    Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
    Kokkos::deep_copy(interf.packindices_schur, packindices_schur);

    interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr_sub"), npacks + 1);
    const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
    for (local_ordinal_type k=0;k<npacks + 1;++k)
      packptr_sub(k) = packptr(k%npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);

    Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)

  return interf;
}
///
/// block tridiagonals
///
template <typename MatrixType>
struct BlockTridiags {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using size_type_2d_view = typename impl_type::size_type_2d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;

  // index pointers into the tridiag values: flat, packed, and Schur variants
  size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
  // local column indices into A from which the tridiag blocks are extracted
  local_ordinal_type_1d_view A_colindsub;
  // tridiag block values (packed across SIMD lanes)
  vector_type_3d_view values;
  // Schur complement block values
  vector_type_3d_view values_schur;
  // off-diagonal coupling (e) block values
  vector_type_4d_view e_values;

  // offsets of the diagonal blocks (fused block Jacobi path)
  size_type_1d_view diag_offsets;
  // diagonal block inverses in flat, block-major format
  // (fused residual + solve block Jacobi path)
  btdm_scalar_type_3d_view d_inv;

  bool is_diagonal_only;

  // Index into a row-major block of a tridiag.
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
  // Given a row of a row-major tridiag, return the index of the first block in that row.
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
  // Number of blocks in a tridiag having a given number of rows.
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
  // Number of blocks associated to a Schur complement having a given number of rows.
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type NumBlocksSchur (const idx_type& nrows) { return nrows > 0 ? 3*nrows + 2 : 0; }
};
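// The helpers above encode a row-major layout of each block tridiagonal in
// which every interior row stores three blocks (sub-, main-, super-diagonal)
// and the first/last rows store two. Worked example: nrows = 4 gives
// NumBlocks(4) = 10 blocks, laid out as
//   index: 0  1  |  2  3  4  |  5  6  7  |  8  9
//   row  : 0  0  |  1  1  1  |  2  2  2  |  3  3
// so RowToIndex(2) = 5 and IndexToRow(7) = 2, consistent with the formulas.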
1626 template<
typename MatrixType>
1629 IFPACK2_BLOCKHELPER_TIMER(
"createBlockTridiags", createBlockTridiags0);
1631 using execution_space =
typename impl_type::execution_space;
1632 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1633 using size_type =
typename impl_type::size_type;
1634 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1636 constexpr
int vector_length = impl_type::vector_length;
1640 const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1643 btdm.flat_td_ptr = size_type_2d_view(
do_not_initialize_tag(
"btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1644 const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part );
1645 Kokkos::parallel_scan
1646 (
"createBlockTridiags::RangePolicy::flat_td_ptr",
1647 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
1648 const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part);
1649 const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1652 btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1654 if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) {
1655 const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0);
1656 if (local_subpartidx % 2 == 0)
1657 update += btdm.NumBlocks(nrows);
1659 update += btdm.NumBlocksSchur(nrows);
1663 const auto nblocks = Kokkos::create_mirror_view_and_copy
1664 (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1));
1665 btdm.is_diagonal_only = (
static_cast<local_ordinal_type
>(nblocks()) == ntridiags);
  if (vector_length == 1) {
    btdm.pack_td_ptr = btdm.flat_td_ptr;
  } else {
    local_ordinal_type npacks_per_subpart = 0;
    const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
    Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
    for (local_ordinal_type ip=1;ip<=interf.nparts;++ip)
      if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
        ++npacks_per_subpart;

    btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
    const Kokkos::RangePolicy<execution_space> policy(0,npacks_per_subpart);

    Kokkos::parallel_for
      ("createBlockTridiags::RangePolicy::pack_td_ptr",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
        for (local_ordinal_type j = 0; j < 2*interf.n_subparts_per_part; ++j) {
          const local_ordinal_type pack_id = ( j == 2*interf.n_subparts_per_part-1 ) ? i+(j-1)*npacks_per_subpart : i+j*npacks_per_subpart;
          const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id+1) - interf.packptr_sub(pack_id);

          const local_ordinal_type parti = interf.packptr_sub(pack_id);
          const local_ordinal_type partidx = parti%interf.nparts;

          for (local_ordinal_type pti=0;pti<nparts_in_pack;++pti) {
            btdm.pack_td_ptr(partidx+pti, j) = btdm.flat_td_ptr(i, j);
          }
        }
      });
  }
  btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);

  const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
  constexpr local_ordinal_type connection_length = 2;

  host_pack_td_ptr_schur(0,0) = 0;
  for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
    if (i % vector_length == 0) {
      if (i != 0)
        host_pack_td_ptr_schur(i,0) = host_pack_td_ptr_schur(i-1,host_pack_td_ptr_schur.extent(1)-1);
      for (local_ordinal_type j = 0; j < interf.n_subparts_per_part-1; ++j) {
        host_pack_td_ptr_schur(i,j+1) = host_pack_td_ptr_schur(i,j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part-2 ? 1 : 0);
      }
    }
    else {
      for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
        host_pack_td_ptr_schur(i,j) = host_pack_td_ptr_schur(i-1,j);
      }
    }
  }

  Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
  const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
  std::cout << "flat_td_ptr = " << std::endl;
  for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
    for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
      std::cout << host_flat_td_ptr(i,j) << " ";
    }
    std::cout << std::endl;
  }
  std::cout << "flat_td_ptr end" << std::endl;

  const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);

  std::cout << "pack_td_ptr = " << std::endl;
  for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
    for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
      std::cout << host_pack_td_ptr(i,j) << " ";
    }
    std::cout << std::endl;
  }
  std::cout << "pack_td_ptr end" << std::endl;

  std::cout << "pack_td_ptr_schur = " << std::endl;
  for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
    for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
      std::cout << host_pack_td_ptr_schur(i,j) << " ";
    }
    std::cout << std::endl;
  }
  std::cout << "pack_td_ptr_schur end" << std::endl;
#endif
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
}
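// setTridiagsToIdentity: when vector_length > 1, a SIMD pack may bundle fewer
// parts than it has lanes, so some lanes of btdm.values may never be filled by
// the extraction.  Pre-filling every diagonal entry with one keeps the batched
// LU / Trsm kernels well defined on those lanes (they see an identity block
// instead of a zero block).  The team/vector loop below writes only the
// diagonal entries of every third packed block, i.e. values(ibeg + 3*ii, j, j, v).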
template<typename MatrixType>
void
setTridiagsToIdentity
  (const BlockTridiags<MatrixType>& btdm,
   const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
{
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type_2d_view = typename impl_type::size_type_2d_view;

  const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
  const local_ordinal_type blocksize = btdm.values.extent(1);

  const int vector_length = impl_type::vector_length;
  const int internal_vector_length = impl_type::internal_vector_length;

  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using internal_vector_type = typename impl_type::internal_vector_type;
  using internal_vector_type_4d_view =
    typename impl_type::internal_vector_type_4d_view;

  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
  const internal_vector_type_4d_view values
    (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
     btdm.values.extent(0),
     btdm.values.extent(1),
     btdm.values.extent(2),
     vector_length/internal_vector_length);
  const local_ordinal_type vector_loop_size = values.extent(3);
#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
  local_ordinal_type total_team_size(0);
  if      (blocksize <=  5) total_team_size =  32;
  else if (blocksize <=  9) total_team_size =  64;
  else if (blocksize <= 12) total_team_size =  96;
  else if (blocksize <= 16) total_team_size = 128;
  else if (blocksize <= 20) total_team_size = 160;
  else                      total_team_size = 160;
  const local_ordinal_type team_size = total_team_size/vector_loop_size;
  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
#elif defined(KOKKOS_ENABLE_HIP)
  local_ordinal_type total_team_size(0);
  if      (blocksize <=  5) total_team_size =  32;
  else if (blocksize <=  9) total_team_size =  64;
  else if (blocksize <= 12) total_team_size =  96;
  else if (blocksize <= 16) total_team_size = 128;
  else if (blocksize <= 20) total_team_size = 160;
  else                      total_team_size = 160;
  const local_ordinal_type team_size = total_team_size/vector_loop_size;
  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
#elif defined(KOKKOS_ENABLE_SYCL)
  local_ordinal_type total_team_size(0);
  if      (blocksize <=  5) total_team_size =  32;
  else if (blocksize <=  9) total_team_size =  64;
  else if (blocksize <= 12) total_team_size =  96;
  else if (blocksize <= 16) total_team_size = 128;
  else if (blocksize <= 20) total_team_size = 160;
  else                      total_team_size = 160;
  const local_ordinal_type team_size = total_team_size/vector_loop_size;
  const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
#else
  const team_policy_type policy(packptr.extent(0)-1, 1, 1);
#endif
  Kokkos::parallel_for
    ("setTridiagsToIdentity::TeamPolicy",
     policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
      const local_ordinal_type k = member.league_rank();
      const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0);
      const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1);

      const local_ordinal_type diff = iend - ibeg;
      const local_ordinal_type icount = diff/3 + (diff%3 > 0);
      const btdm_scalar_type one(1);
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
          const local_ordinal_type i = ibeg + ii*3;
          for (local_ordinal_type j=0;j<blocksize;++j) {
            values(i,j,j,v) = one;
          }
        });
      });
    });
}
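// performSymbolicPhase splits the local matrix as A = D + R, where D collects
// the entries that fall inside a part's block tridiagonal (or its sub-part
// tridiagonals) and R collects everything else.  It builds the column-index
// sub-lists (btdm.A_colindsub, amd.A_colindsub, amd.A_colindsub_remote),
// allocates btdm.values (plus the Schur-related storage when
// n_subparts_per_part > 1), and records whether the owned and remote portions
// of R are kept separate so communication can overlap computation.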
template<typename MatrixType>
void
performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                     const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
                     const BlockHelperDetails::PartInterface<MatrixType> &interf,
                     BlockTridiags<MatrixType> &btdm,
                     BlockHelperDetails::AmD<MatrixType> &amd,
                     const bool overlap_communication_and_computation,
                     const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                     bool useSeqMethod,
                     bool use_fused_jacobi) {
  IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);

  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using host_execution_space = typename impl_type::host_execution_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using size_type = typename impl_type::size_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
  using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;

  constexpr int vector_length = impl_type::vector_length;
  const auto comm = A->getRowMap()->getComm();

  auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
  auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);

  bool hasBlockCrsMatrix = ! A_bcrs.is_null ();

  const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows()/g->getLocalNumRows();

  const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
  const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
  const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
  const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);

  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);

  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());

  const auto rowmap = g->getRowMap();
  const auto colmap = g->getColMap();
  const auto dommap = g->getDomainMap();
  TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
  {
    const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
    Kokkos::parallel_for
      ("performSymbolicPhase::RangePolicy::col2row",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
        const global_ordinal_type gid = rowmap->getGlobalElement(lr);
        if (dommap->isNodeGlobalElement(gid)) {
          const local_ordinal_type lc = colmap->getLocalElement(gid);
# if defined(BLOCKTRIDICONTAINER_DEBUG)
          TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
                                      BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
                                      << " gives an invalid local column.");
# endif
          col2row[lc] = lr;
        }
      });
  }
#endif
  const auto local_graph = g->getLocalGraphHost();
  const auto local_graph_rowptr = local_graph.row_map;
  TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
  const auto local_graph_colidx = local_graph.entries;

  Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
  {
    const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
    Kokkos::parallel_for
      ("performSymbolicPhase::RangePolicy::lclrow2idx",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
        lclrow2idx[lclrow(i)] = i;
      });
  }
1969 typename sum_reducer_type::value_type sum_reducer_value;
1971 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1972 Kokkos::parallel_reduce
1975 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr,
typename sum_reducer_type::value_type &update) {
1977 const local_ordinal_type ri0 = lclrow2idx[lr];
1978 const local_ordinal_type pi0 = rowidx2part(ri0);
1979 for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1980 const local_ordinal_type lc = local_graph_colidx(j);
1981 const local_ordinal_type lc2r = col2row[lc];
1982 bool incr_R =
false;
1984 if (lc2r == (local_ordinal_type) -1) {
1988 const local_ordinal_type ri = lclrow2idx[lc2r];
1989 const local_ordinal_type pi = rowidx2part(ri);
1997 if (ri0 + 1 >= ri && ri0 <= ri + 1)
2003 if (lc < nrows) ++update.v[1];
2007 }, sum_reducer_type(sum_reducer_value));
  size_type D_nnz = sum_reducer_value.v[0];
  size_type R_nnz_owned = sum_reducer_value.v[1];
  size_type R_nnz_remote = sum_reducer_value.v[2];

  if (!overlap_communication_and_computation) {
    R_nnz_owned += R_nnz_remote;
    R_nnz_remote = 0;
  }
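  // When communication and computation are not overlapped there is no separate
  // remote sweep, so the remote nonzeros are folded into the owned count and
  // the remote count is cleared before the R arrays are sized below.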
  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);

  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
#if defined(BLOCKTRIDICONTAINER_DEBUG)
  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif

  const local_ordinal_type nparts = partptr.extent(0) - 1;
  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
  Kokkos::parallel_for
    ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
     policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
      const local_ordinal_type part_ri0 = part2rowidx0(pi0);
      local_ordinal_type offset = 0;
      for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
        const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
        offset = 1;
        const local_ordinal_type lr0 = lclrow(ri0);
        const size_type j0 = local_graph_rowptr(lr0);
        for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
          const local_ordinal_type lc = local_graph_colidx(j);
          const local_ordinal_type lc2r = col2row[lc];
          if (lc2r == (local_ordinal_type) -1) continue;
          const local_ordinal_type ri = lclrow2idx[lc2r];
          const local_ordinal_type pi = rowidx2part(ri);
          if (pi != pi0) continue;
          if (ri + 1 < ri0 || ri > ri0 + 1) continue;
          const local_ordinal_type row_entry = j - j0;
          D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry;
        }
      }
    });
#if defined(BLOCKTRIDICONTAINER_DEBUG)
  for (size_t i=0;i<D_A_colindsub.extent(0);++i)
    TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif
  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1);
  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);

  if (interf.n_subparts_per_part > 1) {
    const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1);
    const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
    btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
  }

  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
  amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);

  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);

  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
  amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);

  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
  {
    const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
    Kokkos::parallel_for
      ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
        const local_ordinal_type ri0 = lclrow2idx[lr];
        const local_ordinal_type pi0 = rowidx2part(ri0);
        const size_type j0 = local_graph_rowptr(lr);
        for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
          const local_ordinal_type lc = local_graph_colidx(j);
          const local_ordinal_type lc2r = col2row[lc];
          if (lc2r != (local_ordinal_type) -1) {
            const local_ordinal_type ri = lclrow2idx[lc2r];
            const local_ordinal_type pi = rowidx2part(ri);
            if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
              continue;
            }
          }
          if (!overlap_communication_and_computation || lc < nrows) {
            ++R_rowptr(lr);
          } else {
            ++R_rowptr_remote(lr);
          }
        }
      });
  }
  {
    Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
    Kokkos::parallel_scan
      ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
                             update_type &update,
                             const bool &final) {
        update_type val;
        val.v[0] = R_rowptr(lr);
        if (overlap_communication_and_computation)
          val.v[1] = R_rowptr_remote(lr);

        if (final) {
          R_rowptr(lr) = update.v[0];
          if (overlap_communication_and_computation)
            R_rowptr_remote(lr) = update.v[1];

          if (lr < nrows) {
            const local_ordinal_type ri0 = lclrow2idx[lr];
            const local_ordinal_type pi0 = rowidx2part(ri0);

            size_type cnt_rowptr = R_rowptr(lr);
            size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0;

            const size_type j0 = local_graph_rowptr(lr);
            for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
              const local_ordinal_type lc = local_graph_colidx(j);
              const local_ordinal_type lc2r = col2row[lc];
              if (lc2r != (local_ordinal_type) -1) {
                const local_ordinal_type ri = lclrow2idx[lc2r];
                const local_ordinal_type pi = rowidx2part(ri);
                if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
                  continue;
              }
              const local_ordinal_type row_entry = j - j0;
              if (!overlap_communication_and_computation || lc < nrows)
                R_A_colindsub(cnt_rowptr++) = row_entry;
              else
                R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
            }
          }
        }
        update += val;
      });
  }
  Kokkos::deep_copy(amd.rowptr, R_rowptr);
  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
  if (overlap_communication_and_computation) {
    Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
    Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
  }

  if (hasBlockCrsMatrix)
    amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst());
  else
    amd.tpetra_values = (const_cast<crs_matrix_type*>(A_crs.get()))->getLocalValuesDevice (Tpetra::Access::ReadWrite);

  if (interf.n_subparts_per_part > 1)
    btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
  if(BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix) {
    bool is_async_importer_active = !async_importer.is_null();
    local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
    bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
    BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
  }

  if(use_fused_jacobi) {
    btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize);
    auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice();
    auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice();
    btdm.diag_offsets = BlockHelperDetails::findDiagOffsets<execution_space, size_type_1d_view>(rowptrs, entries, interf.nparts, blocksize);
  }
  IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
}
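// ExtractAndFactorizeTridiagsDefaultModeAndAlgo maps a Kokkos memory space to
// the KokkosBatched execution mode (Serial on host, Team on devices), the
// Level-3 algorithm variant, and a suggested team size.  A minimal sketch of
// how the trait is consumed (illustrative only; the actual call sites are in
// the functors further below, and the alias name `dmaa` is hypothetical):
//
//   using dmaa = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<
//       typename execution_space::memory_space>;
//   const int team_size = dmaa::recommended_team_size(
//       blocksize, vector_length, internal_vector_length);
//   Kokkos::TeamPolicy<execution_space> policy(npacks, team_size, vector_size);
//
// On GPUs recommended_team_size returns 2*total_team_size/vector_size, where
// total_team_size grows with blocksize (32 up to blocksize 9, then 96, 128,
// and 160 for larger blocks).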
template<typename ArgActiveExecutionMemorySpace>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;

template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
  typedef KB::Mode::Serial mode_type;
#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
  typedef KB::Algo::Level3::CompactMKL algo_type;
#else
  typedef KB::Algo::Level3::Blocked algo_type;
#endif
  static int recommended_team_size(const int /* blksize */,
                                   const int /* vector_length */,
                                   const int /* internal_vector_length */) {
    return 1;
  }
};
#if defined(KOKKOS_ENABLE_CUDA)
static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
                                                             const int vector_length,
                                                             const int internal_vector_length) {
  const int vector_size = vector_length/internal_vector_length;
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return 2*total_team_size/vector_size;
}
template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
  }
};
template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
  }
};
#endif
#if defined(KOKKOS_ENABLE_HIP)
static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
                                                            const int vector_length,
                                                            const int internal_vector_length) {
  const int vector_size = vector_length/internal_vector_length;
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return 2*total_team_size/vector_size;
}
template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
  }
};
template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
  }
};
#endif
#if defined(KOKKOS_ENABLE_SYCL)
static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
                                                             const int vector_length,
                                                             const int internal_vector_length) {
  const int vector_size = vector_length/internal_vector_length;
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return 2*total_team_size/vector_size;
}
template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
  }
};
template<>
struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level3::Unblocked algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
  }
};
#endif
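// solveMultiVector applies one packed block tridiagonal factorization to a
// block of right-hand sides.  Writing the factored line with diagonal LU
// factors L_k U_k, lower couplings C_k and upper couplings B_k (names chosen
// here for exposition), the sweeps below implement, per SIMD lane v,
//
//   forward:   x_1 <- L_1^{-1} x_1,        x_k <- L_k^{-1} (x_k - C_k x_{k-1}),
//   backward:  x_n <- U_n^{-1} x_n,        x_k <- U_k^{-1} (x_k - B_k x_{k+1}),
//
// realized with KokkosBatched Trsm/Gemm on subviews of D_internal_vector_values
// and X_internal_vector_values.  For a single-row line the stored diagonal
// block already holds its inverse, so the solve reduces to one Gemm through
// the scratch block WW.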
template<typename impl_type, typename WWViewType>
KOKKOS_INLINE_FUNCTION
void
solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
                 const typename impl_type::local_ordinal_type &,
                 const typename impl_type::local_ordinal_type &i0,
                 const typename impl_type::local_ordinal_type &r0,
                 const typename impl_type::local_ordinal_type &nrows,
                 const typename impl_type::local_ordinal_type &v,
                 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
                 const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
                 const WWViewType &WW,
                 const bool skip_first_pass=false) {
  using execution_space = typename impl_type::execution_space;
  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
  using member_type = typename team_policy_type::member_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;

  typedef SolveTridiagsDefaultModeAndAlgo
    <typename execution_space::memory_space> default_mode_and_algo_type;

  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
  typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;

  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;

  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();

  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
  auto X2 = X1;

  local_ordinal_type i = i0, r = r0;
2387 if (skip_first_pass) {
2390 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2391 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2392 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2393 KB::Trsm<member_type,
2394 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2395 default_mode_type,default_algo_type>
2396 ::invoke(member, one, A, X2);
2397 X1.assign_data( X2.data() );
2401 KB::Trsm<member_type,
2402 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2403 default_mode_type,default_algo_type>
2404 ::invoke(member, one, A, X1);
    for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
      A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
      X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
      member.team_barrier();
      KB::Gemm<member_type,
               KB::Trans::NoTranspose,KB::Trans::NoTranspose,
               default_mode_type,default_algo_type>
        ::invoke(member, -one, A, X1, one, X2);
      A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
      KB::Trsm<member_type,
               KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, X2);
      X1.assign_data( X2.data() );
    }

    KB::Trsm<member_type,
             KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
             default_mode_type,default_algo_type>
      ::invoke(member, one, A, X1);
    for (local_ordinal_type tr=nrows;tr>1;--tr) {
      i -= 3;
      A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
      X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
      member.team_barrier();
      KB::Gemm<member_type,
               KB::Trans::NoTranspose,KB::Trans::NoTranspose,
               default_mode_type,default_algo_type>
        ::invoke(member, -one, A, X1, one, X2);

      A.assign_data( &D_internal_vector_values(i,0,0,v) );
      KB::Trsm<member_type,
               KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, X2);
      X1.assign_data( X2.data() );
    }
    auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
    KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
      ::invoke(member, X1, W);
    member.team_barrier();
    KB::Gemm<member_type,
             KB::Trans::NoTranspose,KB::Trans::NoTranspose,
             default_mode_type,default_algo_type>
      ::invoke(member, one, A, W, zero, X1);
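// solveSingleVectorNew is the single right-hand-side variant of the solve
// above.  Instead of Trsm/Gemm on matrix subviews it drives the internal
// KOKKOSBATCHED_TRSV_* and GEMV_* kernels directly with raw pointers and the
// view strides (astep/as0/as1 for the factored blocks, xstep/xs0 for the
// vector), which avoids forming per-row subviews inside the sweeps.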
template<typename impl_type, typename WWViewType, typename XViewType>
KOKKOS_INLINE_FUNCTION
void
solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
                     const typename impl_type::local_ordinal_type &blocksize,
                     const typename impl_type::local_ordinal_type &i0,
                     const typename impl_type::local_ordinal_type &r0,
                     const typename impl_type::local_ordinal_type &nrows,
                     const typename impl_type::local_ordinal_type &v,
                     const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
                     const XViewType &X_internal_vector_values,
                     const WWViewType &WW) {
  using execution_space = typename impl_type::execution_space;
  using local_ordinal_type = typename impl_type::local_ordinal_type;

  typedef SolveTridiagsDefaultModeAndAlgo
    <typename execution_space::memory_space> default_mode_and_algo_type;

  typedef typename default_mode_and_algo_type::mode_type default_mode_type;
  typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;

  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;

  auto A = D_internal_vector_values.data();
  auto X = X_internal_vector_values.data();

  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();

  const local_ordinal_type astep = D_internal_vector_values.stride_0();
  const local_ordinal_type as0 = D_internal_vector_values.stride_1();
  const local_ordinal_type as1 = D_internal_vector_values.stride_2();
  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
  const local_ordinal_type xs0 = X_internal_vector_values.stride_1();
2506 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2507 (default_mode_type,default_algo_type,
2510 blocksize,blocksize,
2515 for (local_ordinal_type tr=1;tr<nrows;++tr) {
2516 member.team_barrier();
2517 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2518 (default_mode_type,default_algo_type,
2520 blocksize, blocksize,
2522 A+2*astep, as0, as1,
2526 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2527 (default_mode_type,default_algo_type,
2530 blocksize,blocksize,
2532 A+3*astep, as0, as1,
2540 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2541 (default_mode_type,default_algo_type,
2544 blocksize, blocksize,
2549 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2551 member.team_barrier();
2552 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2553 (default_mode_type,default_algo_type,
2555 blocksize, blocksize,
2557 A+1*astep, as0, as1,
2561 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2562 (default_mode_type,default_algo_type,
2565 blocksize, blocksize,
2574 const local_ordinal_type ws0 = WW.stride_0();
2575 auto W = WW.data() + v;
2576 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2578 member, blocksize, X, xs0, W, ws0);
2579 member.team_barrier();
2580 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2581 (default_mode_type,default_algo_type,
2583 blocksize, blocksize,
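// The three write*ValuesToFile helpers below are debugging aids compiled in
// only when IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM is defined.  They mirror the
// packed device views to the host and dump them in MatrixMarket form,
// unpacking the SIMD dimension (one part per pack lane) so the output can be
// inspected or compared against an external reference.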
template<typename local_ordinal_type, typename ViewType>
void writeBTDValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
  auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
  std::ofstream myfile;
  myfile.open (fileName);

  const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
  local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
  const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
  const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;

  const local_ordinal_type block_size = scalar_values.extent(1);

  const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size;
  const local_ordinal_type n_rows = n_rows_per_part*n_parts;

  const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
  myfile << "%%MatrixMarket matrix coordinate real general"<< std::endl;
  myfile << "%%nnz = " << nnz;
  myfile << " block size = " << block_size;
  myfile << " number of blocks = " << n_blocks;
  myfile << " number of parts = " << n_parts;
  myfile << " number of blocks per part = " << n_blocks_per_part;
  myfile << " number of rows = " << n_rows ;
  myfile << " number of cols = " << n_rows;
  myfile << " number of packs = " << n_packs << std::endl;

  myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2623 local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2624 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2625 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2626 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2627 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2628 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2629 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2631 if (i_block_in_part % 3 == 0) {
2632 current_row_offset = i_block_in_part/3 * block_size;
2633 current_col_offset = i_block_in_part/3 * block_size;
2635 else if (i_block_in_part % 3 == 1) {
2636 current_row_offset = (i_block_in_part-1)/3 * block_size;
2637 current_col_offset = ((i_block_in_part-1)/3+1) * block_size;
2639 else if (i_block_in_part % 3 == 2) {
2640 current_row_offset = ((i_block_in_part-2)/3+1) * block_size;
2641 current_col_offset = (i_block_in_part-2)/3 * block_size;
2643 current_row_offset += current_part_idx * n_rows_per_part;
2644 current_col_offset += current_part_idx * n_rows_per_part;
2645 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2646 for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2647 current_row = current_row_offset + i_in_block + 1;
2648 current_col = current_col_offset + j_in_block + 1;
2649 myfile << current_row <<
" " << current_col <<
" " << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2660 template<
typename local_ordinal_type,
typename ViewType>
2661 void write4DMultiVectorValuesToFile (
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2662 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2663 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2664 std::ofstream myfile;
2665 myfile.open (fileName);
2667 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2668 const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2669 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2671 const local_ordinal_type block_size = scalar_values.extent(1);
2672 const local_ordinal_type n_cols = scalar_values.extent(2);
2674 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2675 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2677 const local_ordinal_type n_packs = ceil(
float(n_parts)/n_parts_per_pack);
2680 myfile <<
"%%MatrixMarket matrix array real general"<< std::endl;
2681 myfile <<
"%%block size = " << block_size;
2682 myfile <<
" number of blocks = " << n_blocks;
2683 myfile <<
" number of parts = " << n_parts;
2684 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2685 myfile <<
" number of rows = " << n_rows ;
2686 myfile <<
" number of cols = " << n_cols;
2687 myfile <<
" number of packs = " << n_packs << std::endl;
2689 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2691 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2692 (void) current_row_offset;
2693 (void) current_part_idx;
2694 for (local_ordinal_type j_in_block=0;j_in_block<n_cols;++j_in_block) {
2695 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2696 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2697 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2698 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2699 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2701 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2703 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2704 myfile << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2714 template<
typename local_ordinal_type,
typename ViewType>
2715 void write5DMultiVectorValuesToFile (
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2716 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2717 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2718 std::ofstream myfile;
2719 myfile.open (fileName);
2721 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2722 const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack;
2723 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2725 const local_ordinal_type block_size = scalar_values.extent(2);
2726 const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2727 const local_ordinal_type n_cols = n_blocks_cols * block_size;
2729 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2730 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2732 const local_ordinal_type n_packs = ceil(
float(n_parts)/n_parts_per_pack);
2734 myfile <<
"%%MatrixMarket matrix array real general"<< std::endl;
2735 myfile <<
"%%block size = " << block_size;
2736 myfile <<
" number of blocks = " << n_blocks;
2737 myfile <<
" number of parts = " << n_parts;
2738 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2739 myfile <<
" number of rows = " << n_rows ;
2740 myfile <<
" number of cols = " << n_cols;
2741 myfile <<
" number of packs = " << n_packs << std::endl;
2743 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2745 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2746 (void) current_row_offset;
2747 (void) current_part_idx;
2748 for (local_ordinal_type i_block_col=0;i_block_col<n_blocks_cols;++i_block_col) {
2749 for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2750 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2751 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2752 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2753 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2754 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2756 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(1))
2758 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2759 myfile << scalar_values(i_block_col,current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
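// copy3DView: team-level helper that copies one 3-D block view into another
// via Kokkos::Experimental::local_deep_copy, so every member of the calling
// team participates in the copy.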
template<typename local_ordinal_type, typename member_type,
         typename ViewType1, typename ViewType2>
KOKKOS_INLINE_FUNCTION
void
copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
  Kokkos::Experimental::local_deep_copy(member, view1, view2);
}
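// ExtractAndFactorizeTridiags is the numeric-phase functor.  Its tagged
// operator() overloads (see the *Tag structs further below) extract the block
// tridiagonal entries of each packed sub-part from the Tpetra matrix, run the
// blocked LU factorization along every sub-line, and, when a part is split
// into several sub-parts, assemble and factorize the Schur complement that
// couples them.  The ScratchLevel template parameter selects which team
// scratch level holds the per-team work block.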
template<typename MatrixType, int ScratchLevel>
struct ExtractAndFactorizeTridiags {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
  using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
  using size_type_2d_view = typename impl_type::size_type_2d_view;
  using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using vector_type_4d_view = typename impl_type::vector_type_4d_view;
  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
  using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
  using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view;
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
  using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type;
  using local_crs_graph_type = typename impl_type::local_crs_graph_type;
  using colinds_view = typename local_crs_graph_type::entries_type;

  using internal_vector_type = typename impl_type::internal_vector_type;
  static constexpr int vector_length = impl_type::vector_length;
  static constexpr int internal_vector_length = impl_type::internal_vector_length;
  static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
  static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
  static constexpr int half_vector_length = impl_type::half_vector_length;

  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
  using member_type = typename team_policy_type::member_type;
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
  const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
  const local_ordinal_type max_partsz;

  using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
  ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
  ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
  ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;

  const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
  const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
  const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
  const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
  const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
  const Unmanaged<btdm_scalar_type_3d_view> d_inv;
  const Unmanaged<size_type_1d_view> diag_offsets;

  const local_ordinal_type blocksize, blocksize_square;
  const magnitude_type tiny;
  const local_ordinal_type vector_loop_size;

  bool hasBlockCrsMatrix;
  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
                              const BlockHelperDetails::PartInterface<MatrixType> &interf_,
                              const Teuchos::RCP<const row_matrix_type> &A_,
                              const Teuchos::RCP<const crs_graph_type> &G_,
                              const magnitude_type& tiny_) :
    partptr(interf_.partptr),
    lclrow(interf_.lclrow),
    packptr(interf_.packptr),
    packindices_sub(interf_.packindices_sub),
    packptr_sub(interf_.packptr_sub),
    partptr_sub(interf_.partptr_sub),
    part2packrowidx0_sub(interf_.part2packrowidx0_sub),
    packindices_schur(interf_.packindices_schur),
    max_partsz(interf_.max_partsz),
    pack_td_ptr(btdm_.pack_td_ptr),
    flat_td_ptr(btdm_.flat_td_ptr),
    pack_td_ptr_schur(btdm_.pack_td_ptr_schur),
    A_colindsub(btdm_.A_colindsub),
    internal_vector_values((internal_vector_type*)btdm_.values.data(),
                           btdm_.values.extent(0),
                           btdm_.values.extent(1),
                           btdm_.values.extent(2),
                           vector_length/internal_vector_length),
    internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(),
                                 btdm_.values_schur.extent(0),
                                 btdm_.values_schur.extent(1),
                                 btdm_.values_schur.extent(2),
                                 vector_length/internal_vector_length),
    e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(),
                             btdm_.e_values.extent(0),
                             btdm_.e_values.extent(1),
                             btdm_.e_values.extent(2),
                             btdm_.e_values.extent(3),
                             vector_length/internal_vector_length),
    scalar_values((btdm_scalar_type*)btdm_.values.data(),
                  btdm_.values.extent(0),
                  btdm_.values.extent(1),
                  btdm_.values.extent(2),
                  vector_length),
    scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(),
                        btdm_.values_schur.extent(0),
                        btdm_.values_schur.extent(1),
                        btdm_.values_schur.extent(2),
                        vector_length),
    e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(),
                    btdm_.e_values.extent(0),
                    btdm_.e_values.extent(1),
                    btdm_.e_values.extent(2),
                    btdm_.e_values.extent(3),
                    vector_length),
    d_inv(btdm_.d_inv),
    diag_offsets(btdm_.diag_offsets),
    blocksize(btdm_.values.extent(1)),
    blocksize_square(blocksize*blocksize),
    tiny(tiny_),
    vector_loop_size(vector_length/internal_vector_length) {
    using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
    using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

    auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
    auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);

    hasBlockCrsMatrix = ! A_bcrs.is_null ();

    A_block_rowptr = G_->getLocalGraphDevice().row_map;
    if (hasBlockCrsMatrix) {
      A_values = const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst();
    } else {
      A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
      A_values = A_crs->getLocalValuesDevice (Tpetra::Access::ReadOnly);
    }
  }
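  // View aliasing sketch: btdm_.values is a rank-3 view of SIMD vector blocks.
  // The constructor above reinterprets the same allocation two ways, assuming
  // vector_length = internal_vector_length * vector_loop_size:
  //
  //   internal_vector_values(i, ii, jj, v)  with v in [0, vector_loop_size)
  //   scalar_values(i, ii, jj, s)           with s in [0, vector_length)
  //
  // e.g. with vector_length = 8 and internal_vector_length = 4, each packed
  // block entry is seen either as 2 internal vectors or as 8 scalars.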
  KOKKOS_INLINE_FUNCTION
  void
  extract(local_ordinal_type partidx,
          local_ordinal_type local_subpartidx,
          local_ordinal_type npacks) const {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
#endif
2956 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2957 const size_type kps = pack_td_ptr(partidx, local_subpartidx);
2958 local_ordinal_type kfs[vector_length] = {};
2959 local_ordinal_type ri0[vector_length] = {};
2960 local_ordinal_type nrows[vector_length] = {};
2962 for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2963 kfs[vi] = flat_td_ptr(partidx,local_subpartidx);
2964 ri0[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,0);
2965 nrows[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,1) - ri0[vi];
2966 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2967 printf(
"kfs[%d] = %d;\n", vi, kfs[vi]);
2968 printf(
"ri0[%d] = %d;\n", vi, ri0[vi]);
2969 printf(
"nrows[%d] = %d;\n", vi, nrows[vi]);
2972 local_ordinal_type tr_min = 0;
2973 local_ordinal_type tr_max = nrows[0];
2974 if (local_subpartidx % 2 == 1) {
2978 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2979 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2981 for (local_ordinal_type tr=tr_min,j=0;tr<tr_max;++tr) {
2982 for (local_ordinal_type e=0;e<3;++e) {
2983 if (hasBlockCrsMatrix) {
2984 const impl_scalar_type* block[vector_length] = {};
2985 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2986 const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2988 block[vi] = &A_values(Aj*blocksize_square);
2990 const size_type pi = kps + j;
2991 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2992 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
2995 for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2996 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2997 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2998 auto& v = internal_vector_values(pi, ii, jj, 0);
2999 for (local_ordinal_type vi=0;vi<npacks;++vi) {
3000 v[vi] =
static_cast<btdm_scalar_type
>(block[vi][idx]);
3006 const size_type pi = kps + j;
3008 for (local_ordinal_type vi=0;vi<npacks;++vi) {
3009 const size_type Aj_c = A_colindsub(kfs[vi] + j);
3011 for (local_ordinal_type ii=0;ii<blocksize;++ii) {
3012 auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr)*blocksize + ii);
3014 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3015 scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c*blocksize + jj);
3021 if (nrows[0] == 1)
break;
3022 if (local_subpartidx % 2 == 0) {
3023 if (e == 1 && (tr == 0 || tr+1 == nrows[0]))
break;
3024 for (local_ordinal_type vi=1;vi<npacks;++vi) {
3025 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
3032 if (e == 0 && (tr == -1 || tr == nrows[0]))
break;
3033 for (local_ordinal_type vi=1;vi<npacks;++vi) {
3034 if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3044 KOKKOS_INLINE_FUNCTION
3046 extract(
const member_type &member,
3047 const local_ordinal_type &partidxbeg,
3048 local_ordinal_type local_subpartidx,
3049 const local_ordinal_type &npacks,
3050 const local_ordinal_type &vbeg)
const {
3051 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3052 printf(
"extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3054 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3055 local_ordinal_type kfs_vals[internal_vector_length] = {};
3056 local_ordinal_type ri0_vals[internal_vector_length] = {};
3057 local_ordinal_type nrows_vals[internal_vector_length] = {};
3059 const size_type kps = pack_td_ptr(partidxbeg,local_subpartidx);
3060 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3061 kfs_vals[vi] = flat_td_ptr(partidxbeg+vi,local_subpartidx);
3062 ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,0);
3063 nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,1) - ri0_vals[vi];
3064 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3065 printf(
"kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3066 printf(
"ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3067 printf(
"nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3071 local_ordinal_type j_vals[internal_vector_length] = {};
3073 local_ordinal_type tr_min = 0;
3074 local_ordinal_type tr_max = nrows_vals[0];
3075 if (local_subpartidx % 2 == 1) {
3079 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3080 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3082 for (local_ordinal_type tr=tr_min;tr<tr_max;++tr) {
3083 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3084 const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
3085 if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows+1)) {
3086 auto &j = j_vals[vi];
3087 const local_ordinal_type kfs = kfs_vals[vi];
3088 const local_ordinal_type ri0 = ri0_vals[vi];
3089 local_ordinal_type lbeg, lend;
3090 if (local_subpartidx % 2 == 0) {
3091 lbeg = (tr == tr_min ? 1 : 0);
3092 lend = (tr == nrows - 1 ? 2 : 3);
3101 else if (tr == nrows) {
3106 if (hasBlockCrsMatrix) {
3107 for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3108 const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3109 const impl_scalar_type* block = &A_values(Aj*blocksize_square);
3110 const size_type pi = kps + j;
3111 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3112 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3114 Kokkos::parallel_for
3115 (Kokkos::TeamThreadRange(member,blocksize),
3116 [&](
const local_ordinal_type &ii) {
3117 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3118 scalar_values(pi, ii, jj, v) =
static_cast<btdm_scalar_type
>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
3124 for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3125 const size_type Aj_c = A_colindsub(kfs + j);
3126 const size_type pi = kps + j;
3127 Kokkos::parallel_for
3128 (Kokkos::TeamThreadRange(member,blocksize),
3129 [&](
const local_ordinal_type &ii) {
3130 auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr)*blocksize + ii);
3131 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3132 scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c*blocksize + jj);
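  // factorize_subline: blocked LU sweep along one sub-line.  With diagonal
  // blocks A_k, upper couplings B_k and lower couplings C_k (names chosen here
  // for exposition), the loop below performs, per SIMD lane v,
  //
  //   A_1 = L_1 U_1,
  //   B_k <- L_k^{-1} B_k,   C_k <- C_k U_k^{-1},
  //   A_{k+1} <- A_{k+1} - C_k B_k,   A_{k+1} = L_{k+1} U_{k+1},
  //
  // with `tiny` passed to the batched LU as a small-pivot safeguard.  A
  // single-block line appears to be handled instead by explicit inversion
  // (SetIdentity plus two triangular solves against the LU factors), so the
  // solve phase can apply it with a single Gemm.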
  template<typename AAViewType,
           typename WWViewType>
  KOKKOS_INLINE_FUNCTION
  void
  factorize_subline(const member_type &member,
                    const local_ordinal_type &i0,
                    const local_ordinal_type &nrows,
                    const local_ordinal_type &v,
                    const AAViewType &AA,
                    const WWViewType &WW) const {
    typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
      <typename execution_space::memory_space> default_mode_and_algo_type;

    typedef typename default_mode_and_algo_type::mode_type default_mode_type;
    typedef typename default_mode_and_algo_type::algo_type default_algo_type;

    const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
    printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
#endif

    auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
    KB::LU<member_type,
           default_mode_type,KB::Algo::LU::Unblocked>
      ::invoke(member, A , tiny);
3175 local_ordinal_type i = i0;
3176 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
3177 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3178 printf(
"tr = %d, i = %d;\n", tr, i);
3180 B.assign_data( &AA(i+1,0,0,v) );
3181 KB::Trsm<member_type,
3182 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3183 default_mode_type,default_algo_type>
3184 ::invoke(member, one, A, B);
3185 C.assign_data( &AA(i+2,0,0,v) );
3186 KB::Trsm<member_type,
3187 KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3188 default_mode_type,default_algo_type>
3189 ::invoke(member, one, A, C);
3190 A.assign_data( &AA(i+3,0,0,v) );
3192 member.team_barrier();
3193 KB::Gemm<member_type,
3194 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3195 default_mode_type,default_algo_type>
3196 ::invoke(member, -one, C, B, one, A);
3198 default_mode_type,KB::Algo::LU::Unblocked>
3199 ::invoke(member, A, tiny);
3203 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3204 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
3205 ::invoke(member, A, W);
3206 KB::SetIdentity<member_type,default_mode_type>
3207 ::invoke(member, A);
3208 member.team_barrier();
3209 KB::Trsm<member_type,
3210 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3211 default_mode_type,default_algo_type>
3212 ::invoke(member, one, W, A);
3213 KB::Trsm<member_type,
3214 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3215 default_mode_type,default_algo_type>
3216 ::invoke(member, one, W, A);
  struct ExtractAndFactorizeSubLineTag {};
  struct ExtractAndFactorizeFusedJacobiTag {};
  struct ExtractBCDTag {};
  struct ComputeETag {};
  struct ComputeSchurTag {};
  struct FactorizeSchurTag {};
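  // The empty tag structs above select which numeric-phase step a TeamPolicy
  // launch runs; Kokkos dispatches to the matching operator() overload.  A
  // minimal sketch of the pattern (illustrative only; the real launches also
  // configure scratch sizes and team shapes first):
  //
  //   Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeSubLineTag>
  //     policy(npacks, team_size, vector_loop_size);
  //   Kokkos::parallel_for("numeric::subline", policy, *this);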
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
        // the packed btdm is sorted from the longest sub-lines down
        const local_ordinal_type packidx = packindices_sub(member.league_rank());

        const local_ordinal_type subpartidx = packptr_sub(packidx);
        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
        const local_ordinal_type partidx = subpartidx%n_parts;

        const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
        const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
        const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);

        internal_vector_scratch_type_3d_view
          WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
        printf("vector_loop_size = %d\n", vector_loop_size);
#endif

        if (vector_loop_size == 1) {
          extract(partidx, local_subpartidx, npacks);
          factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
        } else {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size),
             [&](const local_ordinal_type &v) {
              const local_ordinal_type vbeg = v*internal_vector_length;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
              printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
#endif
              if (vbeg < npacks)
                extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);

              // the extracted blocks must be complete before any lane factorizes
              member.team_barrier();
              factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
            });
        }
      }
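      // In the vectorized branch above, each vector lane v owns
      // internal_vector_length consecutive packed parts: the lane extracts the
      // block values it owns, the team synchronizes, and every lane then runs
      // factorize_subline on its slice of the packed tridiagonal.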
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const ExtractAndFactorizeFusedJacobiTag&, const member_type &member) const {
        using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>;
        using default_mode_type = typename default_mode_and_algo_type::mode_type;
        using default_algo_type = typename default_mode_and_algo_type::algo_type;

        // WW1 holds the diagonal blocks to factorize; WW2 accumulates their inverses.
        btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
        btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), half_vector_length, blocksize, blocksize);
        const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
        const local_ordinal_type nrows = lclrow.extent(0);
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, half_vector_length),
           [&](const local_ordinal_type &v) {
            const local_ordinal_type row = member.league_rank() * half_vector_length + v;
            auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL());
            auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL());
            if (row < nrows) {
              // Copy the diagonal block of this row into scratch.
              const impl_scalar_type* A_diag = A_values.data() + diag_offsets(row);
              Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
                                   [&](const local_ordinal_type &i) {
                                     W1.data()[i] = A_diag[i];
                                   });
              KB::SetIdentity<member_type,default_mode_type>
                ::invoke(member, W2);
            } else {
              // Lanes past the last row factor an identity block so the batched calls stay valid.
              KB::SetIdentity<member_type,default_mode_type>
                ::invoke(member, W1);
            }
            member.team_barrier();
            // LU-factorize W1, then solve W1 * W2 = I to form the block inverse in W2.
            KB::LU<member_type, default_mode_type,KB::Algo::LU::Unblocked>
              ::invoke(member, W1, tiny);
            member.team_barrier();
            KB::Trsm<member_type,
                     KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                     default_mode_type,default_algo_type>
              ::invoke(member, one, W1, W2);
            KB::Trsm<member_type,
                     KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                     default_mode_type,default_algo_type>
              ::invoke(member, one, W1, W2);
            member.team_barrier();
            if (row < nrows) {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize),
                                   [&](const local_ordinal_type &i) {
                                     auto d_inv_block = &d_inv(row, 0, 0);
                                     d_inv_block[i] = W2.data()[i];
                                   });
            }
          });
      }
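      // The fused-Jacobi path factorizes only the point-diagonal blocks: each team
      // inverts up to half_vector_length blocks at once (LU in W1, inverse in W2)
      // and writes the result to d_inv, which the fused residual-and-solve apply
      // path is then expected to consume as a plain block-diagonal inverse.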
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const ExtractBCDTag &, const member_type &member) const {
        const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
        const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
        const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);

        const local_ordinal_type subpartidx = packptr_sub(packidx);
        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
        const local_ordinal_type partidx = subpartidx%n_parts;

        const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;

        if (vector_loop_size == 1) {
          extract(partidx, local_subpartidx, npacks);
        } else {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size),
             [&](const local_ordinal_type &v) {
              const local_ordinal_type vbeg = v*internal_vector_length;
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
              const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
              printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
#endif
              if (vbeg < npacks)
                extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
            });
        }

        member.team_barrier();

        const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
        const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1;

        const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
        const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
#endif

        copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
                                       Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));

        copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
                                       Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
      }
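      // ExtractBCDTag stages the coupling blocks at the two ends of each interior
      // sub-line (rows kps1 and kps2 of the packed tridiagonal) into
      // e_internal_vector_values; ComputeETag and ComputeSchurTag below operate on
      // these staged blocks.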
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const ComputeETag &, const member_type &member) const {
        const local_ordinal_type packidx = packindices_sub(member.league_rank());

        const local_ordinal_type subpartidx = packptr_sub(packidx);
        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
        const local_ordinal_type partidx = subpartidx%n_parts;

        const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
        const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
        const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
        const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
        const local_ordinal_type num_vectors = blocksize;

        internal_vector_scratch_type_3d_view
          WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
        if (local_subpartidx == 0) {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
            });
        }
        else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
            });
        }
        else {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
            });
        }
      }
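      // ComputeETag solves the factored sub-line systems against the staged
      // coupling blocks, producing the E factors: the first and last sub-line of a
      // part only need one side (slice 0 or 1 of e_internal_vector_values), while
      // interior sub-lines need both.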
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const ComputeSchurTag &, const member_type &member) const {
        const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
        const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
        const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);

        const local_ordinal_type subpartidx = packptr_sub(packidx);
        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
        const local_ordinal_type partidx = subpartidx%n_parts;

        const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);

        // Seed the Schur-complement blocks from the packed tridiagonal.
        const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
        const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
        const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;

        for (local_ordinal_type i = 0; i < 4; ++i) {
          copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
                                         Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
        }

        member.team_barrier();

        const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();

        const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1;
        const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2;

        const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
        const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;

        typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
          <typename execution_space::memory_space> default_mode_and_algo_type;
        typedef typename default_mode_and_algo_type::mode_type default_mode_type;
        typedef typename default_mode_and_algo_type::algo_type default_algo_type;

        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
            for (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) {
              local_ordinal_type e_r, e_c, c_kps;

              // Each admissible entry i of this Schur block row selects the coupling
              // block (c_kps = c_kps1 or c_kps2) and the matching E block
              // (row e_r1/e_r2, side e_c); entries with no contribution are skipped.
              // The individual case bodies are not reproduced in this listing.
              if ( local_subpartidx_schur == 0 ) {
                // i == 0 ...
                // else if ( i == 3 ) { ... }
                // else if ( i == 4 ) { ... }
              }
              else {
                // i == 0 ...
                // else if ( i == 1 ) { ... }
                // else if ( i == 4 ) { ... }
                // else if ( i == 5 ) { ... }
              }

              auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v);
              auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
              auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
              KB::Gemm<member_type,
                       KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                       default_mode_type,default_algo_type>
                ::invoke(member, -one, C, E, one, S);
            }
          });
      }
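      // ComputeSchurTag assembles the reduced (Schur-complement) tridiagonal:
      // diagonal blocks copied from the packed tridiagonal are updated in place
      // with S -= C*E, pairing each coupling block C with the corresponding E
      // factor produced by ComputeETag.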
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const FactorizeSchurTag &, const member_type &member) const {
        const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);

        const local_ordinal_type subpartidx = packptr_sub(packidx);

        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
        const local_ordinal_type partidx = subpartidx%n_parts;

        const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0);
        const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);

        internal_vector_scratch_type_3d_view
          WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
        printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
#endif

        if (vector_loop_size == 1) {
          factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
        } else {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size),
             [&](const local_ordinal_type &v) {
              factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
            });
        }
      }
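      // The reduced Schur system is itself a block tridiagonal line (two rows per
      // sub-part interface), so factorize_subline is simply reused on
      // internal_vector_values_schur.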
      void run() {
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
        const local_ordinal_type team_size =
          ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
          recommended_team_size(blocksize, vector_length, internal_vector_length);
        const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
          shmem_size(blocksize, blocksize, vector_loop_size);

        {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("Start ExtractAndFactorizeSubLineTag\n");
#endif
          IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
          Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeSubLineTag>
            policy(packindices_sub.extent(0), team_size, vector_loop_size);

          const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
          writeBTDValuesToFile(n_parts, scalar_values, "before.mm");

          policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
          Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
                               policy, *this);
          execution_space().fence();

          writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
          printf("End ExtractAndFactorizeSubLineTag\n");
#endif
        }

        if (packindices_schur.extent(1) > 0)
        {
          {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
            printf("Start ExtractBCDTag\n");
#endif
            Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
            Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());

            write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");

            {
              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
              Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
                policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);

              policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
              Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
                                   policy, *this);
              execution_space().fence();
            }

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
            printf("End ExtractBCDTag\n");
#endif
            writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
            printf("Start ComputeETag\n");
#endif
            write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");

            {
              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
              Kokkos::TeamPolicy<execution_space,ComputeETag>
                policy(packindices_sub.extent(0), team_size, vector_loop_size);

              policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
              Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
                                   policy, *this);
              execution_space().fence();
            }
            write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");

#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
            printf("End ComputeETag\n");
#endif
          }

          {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
            printf("Start ComputeSchurTag\n");
#endif
            IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
            writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
            Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
              policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);

            Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
                                 policy, *this);
            writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
            execution_space().fence();
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
            printf("End ComputeSchurTag\n");
#endif
          }

          {
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
            printf("Start FactorizeSchurTag\n");
#endif
            IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
            Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
              policy(packindices_schur.extent(0), team_size, vector_loop_size);
            policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
            Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
                                 policy, *this);
            execution_space().fence();
            writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
            printf("End FactorizeSchurTag\n");
#endif
          }
        }

        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      }
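      // run() executes the numeric phase as a pipeline of team kernels: factorize
      // the sub-lines, then (only when sub-part splitting is in effect, i.e.
      // packindices_schur has columns) stage the coupling blocks, compute the E
      // factors, assemble the Schur-complement tridiagonal, and factorize it.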
      void run_fused_jacobi() {
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
        const local_ordinal_type team_size =
          ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
          recommended_team_size(blocksize, half_vector_length, 1);
        const local_ordinal_type per_team_scratch =
          btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * half_vector_length);

        {
          IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag);
          Kokkos::TeamPolicy<execution_space, ExtractAndFactorizeFusedJacobiTag>
            policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length);

          policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
          Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeFusedJacobiTag>",
                               policy, *this);
        }
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      }
    };
  ///
  /// top level numeric interface
  ///
  template<typename MatrixType>
  void
  performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                      const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                      const BlockHelperDetails::PartInterface<MatrixType> &interf,
                      BlockTridiags<MatrixType> &btdm,
                      const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny,
                      bool use_fused_jacobi) {
    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
    using execution_space = typename impl_type::execution_space;
    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
    using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;

    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);

    int blocksize = btdm.values.extent(1);
    // Estimate the per-team scratch requirement of the kernel that will run.
    int scratch_required;
    if (!use_fused_jacobi) {
      // General path: block tridiagonal factorization.
      scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
    } else {
      // Fused-Jacobi path: only the diagonal blocks are factorized and inverted.
      scratch_required = btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, 2 * impl_type::half_vector_length);
    }

    int max_scratch = team_policy_type::scratch_size_max(0);

    if (scratch_required < max_scratch) {
      // Fits in level-0 scratch.
      ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
      if (!use_fused_jacobi)
        function.run();
      else
        function.run_fused_jacobi();
    } else {
      // Fall back to level-1 scratch.
      ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
      if (!use_fused_jacobi)
        function.run();
      else
        function.run_fused_jacobi();
    }
    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
  }
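  // Usage sketch (illustrative only; `interf` and `btdm` come from the symbolic
  // setup performed elsewhere in this file, and the zero `tiny` shift is just an
  // example value):
  //
  //   performNumericPhase<MatrixType>(A, G, interf, btdm,
  //                                   Kokkos::ArithTraits<magnitude_type>::zero(),
  //                                   /*use_fused_jacobi=*/false);
  //
  // re-extracts the block tridiagonal values from A and factorizes them in place,
  // choosing scratch level 0 or 1 based on the per-team shared-memory budget.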
  ///
  /// convert a Tpetra multivector into the packed block format
  ///
  template<typename MatrixType>
  struct MultiVectorConverter {
    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
    using execution_space = typename impl_type::execution_space;
    using memory_space = typename impl_type::memory_space;

    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using btdm_scalar_type = typename impl_type::btdm_scalar_type;
    using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using vector_type_3d_view = typename impl_type::vector_type_3d_view;
    using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
    using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
    static constexpr int vector_length = impl_type::vector_length;

    using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

    // part interface
    const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
    const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
    const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
    const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
    const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
    const local_ordinal_type blocksize;
    const local_ordinal_type num_vectors;

    // packed destination and flat source
    vector_type_3d_view packed_multivector;
    const_impl_scalar_type_2d_view_tpetra scalar_multivector;

    template<typename TagType>
    KOKKOS_INLINE_FUNCTION
    void copy_multivectors(const local_ordinal_type &j,
                           const local_ordinal_type &vi,
                           const local_ordinal_type &pri,
                           const local_ordinal_type &ri0) const {
      for (local_ordinal_type col=0;col<num_vectors;++col)
        for (local_ordinal_type i=0;i<blocksize;++i)
          packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
    }

    MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
                         const vector_type_3d_view &pmv)
      : partptr(interf.partptr),
        packptr(interf.packptr),
        part2packrowidx0(interf.part2packrowidx0),
        part2rowidx0(interf.part2rowidx0),
        lclrow(interf.lclrow),
        blocksize(pmv.extent(1)),
        num_vectors(pmv.extent(2)),
        packed_multivector(pmv) {}
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const local_ordinal_type &packidx) const {
      local_ordinal_type partidx = packptr(packidx);
      local_ordinal_type npacks = packptr(packidx+1) - partidx;
      const local_ordinal_type pri0 = part2packrowidx0(partidx);

      local_ordinal_type ri0[vector_length] = {};
      local_ordinal_type nrows[vector_length] = {};
      for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
        ri0[v] = part2rowidx0(partidx);
        nrows[v] = part2rowidx0(partidx+1) - ri0[v];
      }
      for (local_ordinal_type j=0;j<nrows[0];++j) {
        // parts in a pack are sorted by decreasing length, so shrink npacks
        // once shorter parts run out of rows
        local_ordinal_type cnt = 1;
        for (;cnt<npacks && j!= nrows[cnt];++cnt);
        npacks = cnt;
        const local_ordinal_type pri = pri0 + j;
        for (local_ordinal_type col=0;col<num_vectors;++col)
          for (local_ordinal_type i=0;i<blocksize;++i)
            for (local_ordinal_type v=0;v<npacks;++v)
              packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
      }
    }
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const member_type &member) const {
      const local_ordinal_type packidx = member.league_rank();
      const local_ordinal_type partidx_begin = packptr(packidx);
      const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
      const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
        const local_ordinal_type partidx = partidx_begin + v;
        const local_ordinal_type ri0 = part2rowidx0(partidx);
        const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;

        if (nrows == 1) {
          const local_ordinal_type pri = pri0;
          for (local_ordinal_type col=0;col<num_vectors;++col) {
            Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
              packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
            });
          }
        } else {
          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
            const local_ordinal_type pri = pri0 + j;
            for (local_ordinal_type col=0;col<num_vectors;++col)
              for (local_ordinal_type i=0;i<blocksize;++i)
                packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
          });
        }
      });
    }
    void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
      IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);

      scalar_multivector = scalar_multivector_;
      if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
        const local_ordinal_type vl = vector_length;
        const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
        Kokkos::parallel_for("MultiVectorConverter::TeamPolicy", policy, *this);
      } else {
        const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
        Kokkos::parallel_for("MultiVectorConverter::RangePolicy", policy, *this);
      }
      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
    }
  };
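  // MultiVectorConverter gathers a flat Tpetra multivector into the packed SIMD
  // layout packed_multivector(packed_row, i, col)[v], where lane v enumerates the
  // parts interleaved in one pack; the device build parallelizes over packs and
  // rows with a team policy, the host build loops over packs with a range policy.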
  ///
  /// solve tridiags: default mode/algo selection per memory space
  ///
  template<typename MemorySpaceType> struct SolveTridiagsDefaultModeAndAlgo;

  template<>
  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
    typedef KB::Mode::Serial mode_type;
    typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
    typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
#else
    typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
#endif
    static int recommended_team_size(const int /* blksize */,
                                     const int /* vector_length */,
                                     const int /* internal_vector_length */) {
      return 1;
    }
  };

#if defined(KOKKOS_ENABLE_CUDA)
  static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
                                                         const int vector_length,
                                                         const int internal_vector_length) {
    const int vector_size = vector_length/internal_vector_length;
    int total_team_size(0);
    if      (blksize <=  5) total_team_size =  32;
    else if (blksize <=  9) total_team_size =  32;
    else if (blksize <= 12) total_team_size =  96;
    else if (blksize <= 16) total_team_size = 128;
    else if (blksize <= 20) total_team_size = 160;
    else                    total_team_size = 160;
    return total_team_size/vector_size;
  }

  template<>
  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
    typedef KB::Mode::Team mode_type;
    typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
    typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
    static int recommended_team_size(const int blksize,
                                     const int vector_length,
                                     const int internal_vector_length) {
      return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
    }
  };

  template<>
  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
    typedef KB::Mode::Team mode_type;
    typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
    typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
    static int recommended_team_size(const int blksize,
                                     const int vector_length,
                                     const int internal_vector_length) {
      return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
    }
  };
#endif

#if defined(KOKKOS_ENABLE_HIP)
  static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
                                                        const int vector_length,
                                                        const int internal_vector_length) {
    const int vector_size = vector_length/internal_vector_length;
    int total_team_size(0);
    if      (blksize <=  5) total_team_size =  32;
    else if (blksize <=  9) total_team_size =  32;
    else if (blksize <= 12) total_team_size =  96;
    else if (blksize <= 16) total_team_size = 128;
    else if (blksize <= 20) total_team_size = 160;
    else                    total_team_size = 160;
    return total_team_size/vector_size;
  }

  template<>
  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
    typedef KB::Mode::Team mode_type;
    typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
    typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
    static int recommended_team_size(const int blksize,
                                     const int vector_length,
                                     const int internal_vector_length) {
      return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
    }
  };

  template<>
  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
    typedef KB::Mode::Team mode_type;
    typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
    typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
    static int recommended_team_size(const int blksize,
                                     const int vector_length,
                                     const int internal_vector_length) {
      return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
    }
  };
#endif

#if defined(KOKKOS_ENABLE_SYCL)
  static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
                                                         const int vector_length,
                                                         const int internal_vector_length) {
    const int vector_size = vector_length/internal_vector_length;
    int total_team_size(0);
    if      (blksize <=  5) total_team_size =  32;
    else if (blksize <=  9) total_team_size =  32;
    else if (blksize <= 12) total_team_size =  96;
    else if (blksize <= 16) total_team_size = 128;
    else if (blksize <= 20) total_team_size = 160;
    else                    total_team_size = 160;
    return total_team_size/vector_size;
  }

  template<>
  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
    typedef KB::Mode::Team mode_type;
    typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
    typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
    static int recommended_team_size(const int blksize,
                                     const int vector_length,
                                     const int internal_vector_length) {
      return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
    }
  };

  template<>
  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
    typedef KB::Mode::Team mode_type;
    typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
    typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
    static int recommended_team_size(const int blksize,
                                     const int vector_length,
                                     const int internal_vector_length) {
      return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
    }
  };
#endif
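  // The SolveTridiagsRecommended*TeamSize helpers use the same heuristic on every
  // GPU backend: pick a total thread count that grows with the block size (32 up
  // to blocks of 9, 160 for blocks of 17 or more) and divide it by the number of
  // internal vectors packed per SIMD vector to get the per-team size.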
  template<typename MatrixType>
  struct SolveTridiags {
    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
    using execution_space = typename impl_type::execution_space;

    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using size_type = typename impl_type::size_type;
    using impl_scalar_type = typename impl_type::impl_scalar_type;
    using magnitude_type = typename impl_type::magnitude_type;
    using btdm_scalar_type = typename impl_type::btdm_scalar_type;
    using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
    // views
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
    using size_type_2d_view = typename impl_type::size_type_2d_view;
    // vectorization
    using vector_type_3d_view = typename impl_type::vector_type_3d_view;
    using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
    using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
    using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;

    using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;

    using internal_vector_type = typename impl_type::internal_vector_type;
    static constexpr int vector_length = impl_type::vector_length;
    static constexpr int internal_vector_length = impl_type::internal_vector_length;

    // multivector views
    using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
    using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

    // team policy types
    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    using member_type = typename team_policy_type::member_type;

    // part interface
    local_ordinal_type n_subparts_per_part;
    const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
    const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
    const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
    const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
    const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
    const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
    const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
    const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;

    const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
    const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;

    // block tridiags
    const ConstUnmanaged<size_type_2d_view> pack_td_ptr;

    // block tridiags values and packed multivector views
    const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
    const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
    const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;

    internal_vector_type_4d_view X_internal_vector_values_schur;

    const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
    const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;

    const local_ordinal_type vector_loop_size;

    // copy to flat multivector
    Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
    AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
#else
    Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
#endif
    const impl_scalar_type df;
    const bool compute_diff;

    SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
                  const BlockTridiags<MatrixType> &btdm,
                  const vector_type_3d_view &pmv,
                  const impl_scalar_type damping_factor,
                  const bool is_norm_manager_active)
      :
      // interface
      n_subparts_per_part(interf.n_subparts_per_part),
      partptr(interf.partptr),
      packptr(interf.packptr),
      packindices_sub(interf.packindices_sub),
      packindices_schur(interf.packindices_schur),
      part2packrowidx0(interf.part2packrowidx0),
      part2packrowidx0_sub(interf.part2packrowidx0_sub),
      lclrow(interf.lclrow),
      packptr_sub(interf.packptr_sub),
      partptr_sub(interf.partptr_sub),
      pack_td_ptr_schur(btdm.pack_td_ptr_schur),
      // block tridiags and packed multivector
      pack_td_ptr(btdm.pack_td_ptr),
      D_internal_vector_values((internal_vector_type*)btdm.values.data(),
                               btdm.values.extent(0),
                               btdm.values.extent(1),
                               btdm.values.extent(2),
                               vector_length/internal_vector_length),
      X_internal_vector_values((internal_vector_type*)pmv.data(),
                               pmv.extent(0),
                               pmv.extent(1),
                               pmv.extent(2),
                               vector_length/internal_vector_length),
      X_internal_scalar_values((btdm_scalar_type*)pmv.data(),
                               pmv.extent(0),
                               pmv.extent(1),
                               pmv.extent(2),
                               vector_length),
      X_internal_vector_values_schur("X_internal_vector_values_schur",
                                     2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0),
                                     pmv.extent(1),
                                     pmv.extent(2),
                                     vector_length/internal_vector_length),
      D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(),
                                     btdm.values_schur.extent(0),
                                     btdm.values_schur.extent(1),
                                     btdm.values_schur.extent(2),
                                     vector_length/internal_vector_length),
      e_internal_vector_values((internal_vector_type*)btdm.e_values.data(),
                               btdm.e_values.extent(0),
                               btdm.e_values.extent(1),
                               btdm.e_values.extent(2),
                               btdm.e_values.extent(3),
                               vector_length/internal_vector_length),
      vector_loop_size(vector_length/internal_vector_length),
      Y_scalar_multivector(),
      Z_scalar_vector(),
      df(damping_factor),
      compute_diff(is_norm_manager_active)
    {}
    KOKKOS_INLINE_FUNCTION
    void
    copyToFlatMultiVector(const member_type &member,
                          const local_ordinal_type partidxbeg, // partidx for vector lane 0
                          const local_ordinal_type npacks,
                          const local_ordinal_type pri0,
                          const local_ordinal_type v, // index in the vector_loop_size loop
                          const local_ordinal_type blocksize,
                          const local_ordinal_type num_vectors) const {
      const local_ordinal_type vbeg = v*internal_vector_length;
      if (vbeg < npacks) {
        local_ordinal_type ri0_vals[internal_vector_length] = {};
        local_ordinal_type nrows_vals[internal_vector_length] = {};
        for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
          const local_ordinal_type partidx = partidxbeg+vv;
          ri0_vals[vi] = partptr(partidx);
          nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
        }

        impl_scalar_type z_partial_sum(0);
        if (nrows_vals[0] == 1) {
          const local_ordinal_type j=0, pri=pri0;
          for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
            const local_ordinal_type ri0 = ri0_vals[vi];
            const local_ordinal_type nrows = nrows_vals[vi];
            if (j < nrows) {
              Kokkos::parallel_for
                (Kokkos::TeamThreadRange(member, blocksize),
                 [&](const local_ordinal_type &i) {
                  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
                  for (local_ordinal_type col=0;col<num_vectors;++col) {
                    // damped update of Y; the squared difference feeds the norm check
                    impl_scalar_type &y = Y_scalar_multivector(row,col);
                    const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
                    y += df*yd;
                    {
                      const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
                      z_partial_sum += yd_abs*yd_abs;
                    }
                  }
                });
            }
          }
        } else {
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member, nrows_vals[0]),
             [&](const local_ordinal_type &j) {
              const local_ordinal_type pri = pri0 + j;
              for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
                const local_ordinal_type ri0 = ri0_vals[vi];
                const local_ordinal_type nrows = nrows_vals[vi];
                if (j < nrows) {
                  for (local_ordinal_type col=0;col<num_vectors;++col) {
                    for (local_ordinal_type i=0;i<blocksize;++i) {
                      const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
                      impl_scalar_type &y = Y_scalar_multivector(row,col);
                      const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
                      y += df*yd;
                      {
                        const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
                        z_partial_sum += yd_abs*yd_abs;
                      }
                    }
                  }
                }
              }
            });
        }
        // accumulate the per-team squared update norm
        if (compute_diff)
          Z_scalar_vector(member.league_rank()) += z_partial_sum;
      }
    }
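    // copyToFlatMultiVector scatters the packed solution back into the flat
    // Tpetra vector Y, applying the damping factor to the update, and (when the
    // norm manager is active) accumulates the squared size of the update per team
    // into Z_scalar_vector, which is declared atomic on device builds.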
    template<typename WWViewType>
    KOKKOS_INLINE_FUNCTION
    void
    solveSingleVector(const member_type &member,
                      const local_ordinal_type &blocksize,
                      const local_ordinal_type &i0,
                      const local_ordinal_type &r0,
                      const local_ordinal_type &nrows,
                      const local_ordinal_type &v,
                      const WWViewType &WW) const {
      typedef SolveTridiagsDefaultModeAndAlgo
        <typename execution_space::memory_space> default_mode_and_algo_type;
      typedef typename default_mode_and_algo_type::mode_type default_mode_type;
      typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;

      // base pointers of the packed factors and of the packed right-hand side
      auto A = D_internal_vector_values.data();
      auto X = X_internal_vector_values.data();

      // constants
      const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
      const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();

      // strides of the packed views
      const local_ordinal_type astep = D_internal_vector_values.stride_0();
      const local_ordinal_type as0 = D_internal_vector_values.stride_1();
      const local_ordinal_type as1 = D_internal_vector_values.stride_2();
      const local_ordinal_type xstep = X_internal_vector_values.stride_0();
      const local_ordinal_type xs0 = X_internal_vector_values.stride_1();

      // move to the start of this sub-line and vector lane
      A += i0*astep + v;
      X += r0*xstep + v;

      if (nrows > 1) {
        // solve L x = x (forward sweep)
        KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
          (default_mode_type,default_algo_type,
           member, KB::Diag::Unit,
           blocksize,blocksize,
           one, A, as0, as1, X, xs0);

        for (local_ordinal_type tr=1;tr<nrows;++tr) {
          member.team_barrier();
          KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
            (default_mode_type,default_algo_type,
             member,
             blocksize, blocksize,
             -one,
             A+2*astep, as0, as1,
             X, xs0,
             one, X+1*xstep, xs0);
          KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
            (default_mode_type,default_algo_type,
             member, KB::Diag::Unit,
             blocksize,blocksize,
             one,
             A+3*astep, as0, as1,
             X+1*xstep, xs0);
          A += 3*astep;
          X += 1*xstep;
        }

        // solve U x = x (backward sweep)
        KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
          (default_mode_type,default_algo_type,
           member, KB::Diag::NonUnit,
           blocksize, blocksize,
           one, A, as0, as1, X, xs0);

        for (local_ordinal_type tr=nrows;tr>1;--tr) {
          A -= 3*astep;
          member.team_barrier();
          KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
            (default_mode_type,default_algo_type,
             member,
             blocksize, blocksize,
             -one,
             A+1*astep, as0, as1,
             X, xs0,
             one, X-1*xstep, xs0);
          KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
            (default_mode_type,default_algo_type,
             member, KB::Diag::NonUnit,
             blocksize, blocksize,
             one, A, as0, as1, X-1*xstep, xs0);
          X -= 1*xstep;
        }
      } else {
        // single-row sub-line: the block is stored as its inverse, apply it with a GEMV
        const local_ordinal_type ws0 = WW.stride_0();
        auto W = WW.data() + v;
        KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
          (default_mode_type,
           member, blocksize, X, xs0, W, ws0);
        member.team_barrier();
        KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
          (default_mode_type,default_algo_type,
           member,
           blocksize, blocksize,
           one, A, as0, as1, W, ws0, zero, X, xs0);
      }
    }
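    // solveSingleVector runs the forward (L) and backward (U) block substitution
    // for a single right-hand side directly on raw pointers and strides of the
    // packed views, presumably to avoid per-block subview construction; the
    // single-row case applies the explicitly inverted block with one GEMV.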
    template<typename WWViewType>
    KOKKOS_INLINE_FUNCTION
    void
    solveMultiVector(const member_type &member,
                     const local_ordinal_type &, // blocksize (taken from the views)
                     const local_ordinal_type &i0,
                     const local_ordinal_type &r0,
                     const local_ordinal_type &nrows,
                     const local_ordinal_type &v,
                     const WWViewType &WW) const {
      typedef SolveTridiagsDefaultModeAndAlgo
        <typename execution_space::memory_space> default_mode_and_algo_type;
      typedef typename default_mode_and_algo_type::mode_type default_mode_type;
      typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;

      // constants
      const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
      const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();

      // subview pattern
      auto A  = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
      auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
      auto X2 = X1;

      local_ordinal_type i = i0, r = r0;

      if (nrows > 1) {
        // solve L x = x
        KB::Trsm<member_type,
                 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                 default_mode_type,default_algo_type>
          ::invoke(member, one, A, X1);
        for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
          A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
          X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
          member.team_barrier();
          KB::Gemm<member_type,
                   KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                   default_mode_type,default_algo_type>
            ::invoke(member, -one, A, X1, one, X2);
          A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
          KB::Trsm<member_type,
                   KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                   default_mode_type,default_algo_type>
            ::invoke(member, one, A, X2);
          X1.assign_data( X2.data() );
        }

        // solve U x = x
        KB::Trsm<member_type,
                 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                 default_mode_type,default_algo_type>
          ::invoke(member, one, A, X1);
        for (local_ordinal_type tr=nrows;tr>1;--tr) {
          i -= 3;
          A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
          X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
          member.team_barrier();
          KB::Gemm<member_type,
                   KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                   default_mode_type,default_algo_type>
            ::invoke(member, -one, A, X1, one, X2);

          A.assign_data( &D_internal_vector_values(i,0,0,v) );
          KB::Trsm<member_type,
                   KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                   default_mode_type,default_algo_type>
            ::invoke(member, one, A, X2);
          X1.assign_data( X2.data() );
        }
      } else {
        // the block was inverted during factorization; apply it with a GEMM
        auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
        KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
          ::invoke(member, X1, W);
        member.team_barrier();
        KB::Gemm<member_type,
                 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                 default_mode_type,default_algo_type>
          ::invoke(member, one, A, W, zero, X1);
      }
    }
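    // solveMultiVector is the multiple right-hand-side variant of the same
    // forward/backward block substitution, expressed with KokkosBatched
    // Trsm/Gemm on subviews instead of the raw-pointer internal kernels.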
    template<int B> struct SingleVectorTag {};
    template<int B> struct MultiVectorTag {};

    template<int B> struct SingleVectorSubLineTag {};
    template<int B> struct MultiVectorSubLineTag {};
    template<int B> struct SingleVectorApplyCTag {};
    template<int B> struct MultiVectorApplyCTag {};
    template<int B> struct SingleVectorSchurTag {};
    template<int B> struct MultiVectorSchurTag {};
    template<int B> struct SingleVectorApplyETag {};
    template<int B> struct MultiVectorApplyETag {};
    template<int B> struct SingleVectorCopyToFlatTag {};
    template<int B> struct SingleZeroingTag {};
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorTag<B> &, const member_type &member) const {
      const local_ordinal_type packidx = member.league_rank();
      const local_ordinal_type partidx = packptr(packidx);
      const local_ordinal_type npacks = packptr(packidx+1) - partidx;
      const local_ordinal_type pri0 = part2packrowidx0(partidx);
      const local_ordinal_type i0 = pack_td_ptr(partidx,0);
      const local_ordinal_type r0 = part2packrowidx0(partidx);
      const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
      const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
      const local_ordinal_type num_vectors = 1;
      internal_vector_scratch_type_3d_view
        WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
      Kokkos::single(Kokkos::PerTeam(member), [&]() {
        Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
      });
      Kokkos::parallel_for
        (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
          solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
          copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
        });
    }
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const MultiVectorTag<B> &, const member_type &member) const {
      const local_ordinal_type packidx = member.league_rank();
      const local_ordinal_type partidx = packptr(packidx);
      const local_ordinal_type npacks = packptr(packidx+1) - partidx;
      const local_ordinal_type pri0 = part2packrowidx0(partidx);
      const local_ordinal_type i0 = pack_td_ptr(partidx,0);
      const local_ordinal_type r0 = part2packrowidx0(partidx);
      const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
      const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
      const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);

      internal_vector_scratch_type_3d_view
        WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
      Kokkos::single(Kokkos::PerTeam(member), [&]() {
        Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
      });
      Kokkos::parallel_for
        (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
          solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
          copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
        });
    }
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorSubLineTag<B> &, const member_type &member) const {
      // the packed btdm is sorted from the longest sub-lines down
      const local_ordinal_type packidx = packindices_sub(member.league_rank());

      const local_ordinal_type subpartidx = packptr_sub(packidx);
      const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
      const local_ordinal_type local_subpartidx = subpartidx/n_parts;
      const local_ordinal_type partidx = subpartidx%n_parts;

      const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
      const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
      const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
      const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
      const local_ordinal_type blocksize = e_internal_vector_values.extent(2);

      internal_vector_scratch_type_3d_view
        WW(member.team_scratch(0), blocksize, 1, vector_loop_size);

      Kokkos::parallel_for
        (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
          solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
        });
    }
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorApplyCTag<B> &, const member_type &member) const {
      const local_ordinal_type packidx = packindices_sub(member.league_rank());

      const local_ordinal_type subpartidx = packptr_sub(packidx);
      const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
      const local_ordinal_type local_subpartidx = subpartidx/n_parts;
      const local_ordinal_type partidx = subpartidx%n_parts;
      const local_ordinal_type blocksize = e_internal_vector_values.extent(2);

      const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
      const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
      const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);

      internal_vector_scratch_type_3d_view
        WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);

      // update the interface rows: v_2 = v_2 - C * v_1

      const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
      const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
      const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;

      const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();

      const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0;
      const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1;

      typedef SolveTridiagsDefaultModeAndAlgo
        <typename execution_space::memory_space> default_mode_and_algo_type;
      typedef typename default_mode_and_algo_type::mode_type default_mode_type;
      typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;

      if (local_subpartidx == 0) {
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
            auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
            auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
            auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);

            KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
              (default_mode_type,default_algo_type,
               member,
               blocksize, blocksize,
               -one,
               C.data(), C.stride_0(), C.stride_1(),
               v_1.data(), v_1.stride_0(),
               one,
               v_2.data(), v_2.stride_0());
          });
      }
      else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
            auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
            auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
            auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);

            KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
              (default_mode_type,default_algo_type,
               member,
               blocksize, blocksize,
               -one,
               C.data(), C.stride_0(), C.stride_1(),
               v_1.data(), v_1.stride_0(),
               one,
               v_2.data(), v_2.stride_0());
          });
      }
      else {
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
            {
              auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
              auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
              auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);

              KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
                (default_mode_type,default_algo_type,
                 member,
                 blocksize, blocksize,
                 -one,
                 C.data(), C.stride_0(), C.stride_1(),
                 v_1.data(), v_1.stride_0(),
                 one,
                 v_2.data(), v_2.stride_0());
            }
            {
              auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
              auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
              auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);

              KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
                (default_mode_type,default_algo_type,
                 member,
                 blocksize, blocksize,
                 -one,
                 C.data(), C.stride_0(), C.stride_1(),
                 v_1.data(), v_1.stride_0(),
                 one,
                 v_2.data(), v_2.stride_0());
            }
          });
      }
    }
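    // ApplyC updates the right-hand side rows adjacent to each sub-line
    // interface: after the interior sub-line solve, the coupling blocks stored at
    // c_kps1/c_kps2 are applied as v_2 -= C * v_1, so the reduced Schur system
    // sees a consistent right-hand side.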
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorSchurTag<B> &, const member_type &member) const {
      const local_ordinal_type packidx = packindices_sub(member.league_rank());

      const local_ordinal_type partidx = packptr_sub(packidx);

      const local_ordinal_type blocksize = e_internal_vector_values.extent(2);

      const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0);
      const local_ordinal_type nrows = 2*(n_subparts_per_part-1);

      const local_ordinal_type r0_schur = nrows * member.league_rank();

      internal_vector_scratch_type_3d_view
        WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);

      // gather the interface rows of X into the Schur right-hand side
      for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
        const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
        for (local_ordinal_type i = 0; i < 2; ++i) {
          copy3DView<local_ordinal_type>(member,
                                         Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
                                         Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
        }
      }

      member.team_barrier();

      // solve the factored reduced system for this part
      Kokkos::parallel_for
        (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
          solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
        });

      member.team_barrier();

      // scatter the Schur solution back to the interface rows of X
      for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
        const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
        for (local_ordinal_type i = 0; i < 2; ++i) {
          copy3DView<local_ordinal_type>(member,
                                         Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
                                         Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
        }
      }
    }
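    // The Schur sweep gathers the two interface rows of every sub-part boundary
    // into X_internal_vector_values_schur, solves the factored reduced system
    // with solveSingleVectorNew, and scatters the result back before ApplyE
    // reconstructs the interior rows.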
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorApplyETag<B> &, const member_type &member) const {
      const local_ordinal_type packidx = packindices_sub(member.league_rank());

      const local_ordinal_type subpartidx = packptr_sub(packidx);
      const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
      const local_ordinal_type local_subpartidx = subpartidx/n_parts;
      const local_ordinal_type partidx = subpartidx%n_parts;
      const local_ordinal_type blocksize = e_internal_vector_values.extent(2);

      const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
      const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);

      internal_vector_scratch_type_3d_view
        WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);

      const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();

      typedef SolveTridiagsDefaultModeAndAlgo
        <typename execution_space::memory_space> default_mode_and_algo_type;

      typedef typename default_mode_and_algo_type::mode_type default_mode_type;
      typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;

      if (local_subpartidx == 0) {
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
            auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);

            for (local_ordinal_type row = 0; row < nrows; ++row) {
              auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
              auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);

              KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
                (default_mode_type,default_algo_type,
                 member,
                 blocksize, blocksize,
                 -one,
                 E.data(), E.stride_0(), E.stride_1(),
                 v_2.data(), v_2.stride_0(),
                 one,
                 v_1.data(), v_1.stride_0());
            }
          });
      }
      else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
            auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);

            for (local_ordinal_type row = 0; row < nrows; ++row) {
              auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
              auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);

              KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
                (default_mode_type,default_algo_type,
                 member,
                 blocksize, blocksize,
                 -one,
                 E.data(), E.stride_0(), E.stride_1(),
                 v_2.data(), v_2.stride_0(),
                 one,
                 v_1.data(), v_1.stride_0());
            }
          });
      }
      else {
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
            {
              auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);

              for (local_ordinal_type row = 0; row < nrows; ++row) {
                auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
                auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);

                KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
                  (default_mode_type,default_algo_type,
                   member,
                   blocksize, blocksize,
                   -one,
                   E.data(), E.stride_0(), E.stride_1(),
                   v_2.data(), v_2.stride_0(),
                   one,
                   v_1.data(), v_1.stride_0());
              }
            }
            {
              auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);

              for (local_ordinal_type row = 0; row < nrows; ++row) {
                auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
                auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);

                KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
                  (default_mode_type,default_algo_type,
                   member,
                   blocksize, blocksize,
                   -one,
                   E.data(), E.stride_0(), E.stride_1(),
                   v_2.data(), v_2.stride_0(),
                   one,
                   v_1.data(), v_1.stride_0());
              }
            }
          });
      }
    }
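    // Note on the ApplyETag step above (explanatory sketch, not part of the
    // original kernel): once the Schur-complement system for the interface
    // unknowns has been solved, each interior row of a sub-line is corrected
    // against its neighboring interface value(s),
    //
    //   x_row <- x_row - E * x_interface,
    //
    // which is what the GEMV invocation with alpha = -1 and beta = 1 computes.
    // A plain serial equivalent of one such block update would look roughly
    // like this (identifiers here are illustrative only):
    //
    //   for (int i = 0; i < blocksize; ++i) {
    //     auto acc = x_row(i);
    //     for (int j = 0; j < blocksize; ++j)
    //       acc -= E(i, j) * x_interface(j);
    //     x_row(i) = acc;
    //   }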
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorCopyToFlatTag<B> &, const member_type &member) const {
      const local_ordinal_type packidx = member.league_rank();
      const local_ordinal_type partidx = packptr(packidx);
      const local_ordinal_type npacks = packptr(packidx+1) - partidx;
      const local_ordinal_type pri0 = part2packrowidx0(partidx);
      const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
      const local_ordinal_type num_vectors = 1;

      Kokkos::parallel_for
        (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
          copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
        });
    }
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleZeroingTag<B> &, const member_type &member) const {
      Kokkos::single(Kokkos::PerTeam(member), [&]() {
        Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
      });
    }
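    // The run() method below dispatches the solve as a sequence of tagged team
    // kernels. For a single right-hand side with sub-line splitting enabled the
    // (approximate) order is: SingleZeroingTag (reset the per-line norm
    // accumulator Z_scalar_vector), SingleVectorSubLineTag (solve each interior
    // sub-line), SingleVectorApplyCTag, SingleVectorSchurTag (form and solve
    // the interface Schur system), SingleVectorApplyETag (back-substitute the
    // interface values), and SingleVectorCopyToFlatTag (unpack into the Tpetra
    // multivector). This summary is descriptive only; the authoritative order
    // is the macro inside run().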
    void run(const impl_scalar_type_2d_view_tpetra &Y,
             const impl_scalar_type_1d_view &Z) {
      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
      IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);

      this->Y_scalar_multivector = Y;
      this->Z_scalar_vector = Z;

      const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
      const local_ordinal_type blocksize = D_internal_vector_values.extent(1);

      const local_ordinal_type team_size =
        SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
        recommended_team_size(blocksize, vector_length, internal_vector_length);
      const int per_team_scratch = internal_vector_scratch_type_3d_view::
        shmem_size(blocksize, num_vectors, vector_loop_size);
#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
      if (num_vectors == 1) {                                           \
        const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> >   \
          policy(packptr.extent(0) - 1, team_size, vector_loop_size);   \
        Kokkos::parallel_for                                            \
          ("SolveTridiags::TeamPolicy::run<SingleVector>",              \
           policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
      } else {                                                          \
        const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >    \
          policy(packptr.extent(0) - 1, team_size, vector_loop_size);   \
        Kokkos::parallel_for                                            \
          ("SolveTridiags::TeamPolicy::run<MultiVector>",               \
           policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
      } break
#else
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
      if (num_vectors == 1) {                                           \
        if (packindices_schur.extent(1) <= 0) {                         \
          Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> >       \
            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
          policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
          Kokkos::parallel_for                                          \
            ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
             policy, *this);                                            \
        }                                                               \
        else {                                                          \
          {                                                             \
            Kokkos::TeamPolicy<execution_space,SingleZeroingTag<B> >    \
              policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
            Kokkos::parallel_for                                        \
              ("SolveTridiags::TeamPolicy::run<SingleZeroingTag>",      \
               policy, *this);                                          \
          }                                                             \
          {                                                             \
            IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
            write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
            Kokkos::TeamPolicy<execution_space,SingleVectorSubLineTag<B> > \
              policy(packindices_sub.extent(0), team_size, vector_loop_size); \
            policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
            Kokkos::parallel_for                                        \
              ("SolveTridiags::TeamPolicy::run<SingleVector>",          \
               policy, *this);                                          \
            write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
            IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)            \
          }                                                             \
          {                                                             \
            IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
            write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
            Kokkos::TeamPolicy<execution_space,SingleVectorApplyCTag<B> > \
              policy(packindices_sub.extent(0), team_size, vector_loop_size); \
            policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
            Kokkos::parallel_for                                        \
              ("SolveTridiags::TeamPolicy::run<SingleVector>",          \
               policy, *this);                                          \
            write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
            IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)            \
          }                                                             \
          {                                                             \
            IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
            write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
            Kokkos::TeamPolicy<execution_space,SingleVectorSchurTag<B> > \
              policy(packindices_schur.extent(0), team_size, vector_loop_size); \
            policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
            Kokkos::parallel_for                                        \
              ("SolveTridiags::TeamPolicy::run<SingleVector>",          \
               policy, *this);                                          \
            write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
            IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)            \
          }                                                             \
          {                                                             \
            IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
            write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
            Kokkos::TeamPolicy<execution_space,SingleVectorApplyETag<B> > \
              policy(packindices_sub.extent(0), team_size, vector_loop_size); \
            policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
            Kokkos::parallel_for                                        \
              ("SolveTridiags::TeamPolicy::run<SingleVector>",          \
               policy, *this);                                          \
            write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
            IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)            \
          }                                                             \
          {                                                             \
            Kokkos::TeamPolicy<execution_space,SingleVectorCopyToFlatTag<B> > \
              policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
            Kokkos::parallel_for                                        \
              ("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
               policy, *this);                                          \
          }                                                             \
        }                                                               \
      } else {                                                          \
        Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >          \
          policy(packptr.extent(0) - 1, team_size, vector_loop_size);   \
        policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));   \
        Kokkos::parallel_for                                            \
          ("SolveTridiags::TeamPolicy::run<MultiVector>",               \
           policy, *this);                                              \
      } break
#endif
      switch (blocksize) {
      case  3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
      case  5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
      case  6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 6);
      case  7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
      case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
      case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
      case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
      case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
      case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
      default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS

      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
    }
  };
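  // The switch above illustrates a pattern used throughout this file: the block
  // size is promoted to a compile-time constant for a handful of common sizes,
  // with B == 0 meaning "read the block size at runtime". A minimal,
  // self-contained sketch of the same idea (names are illustrative, not part of
  // this file):
  //
  //   template <int B> void kernel(int runtime_blocksize) {
  //     const int blocksize = (B == 0 ? runtime_blocksize : B);
  //     // loops over 'blocksize' can be fully unrolled when B != 0
  //   }
  //
  //   void dispatch(int blocksize) {
  //     switch (blocksize) {
  //     case 3:  kernel<3>(blocksize);  break;
  //     case 5:  kernel<5>(blocksize);  break;
  //     default: kernel<0>(blocksize);  break;
  //     }
  //   }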
  ///
  /// top level apply interface
  ///
  template<typename MatrixType>
  int
  applyInverseJacobi(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
                     const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                     const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
                     const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                     const bool overlap_communication_and_computation,
                     // tpetra interface
                     const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
                     typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
                     typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z,
                     typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
                     // local object interface
                     const BlockHelperDetails::PartInterface<MatrixType> &interf,
                     const BlockTridiags<MatrixType> &btdm,
                     const BlockHelperDetails::AmD<MatrixType> &amd,
                     typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work,
                     BlockHelperDetails::NormManager<MatrixType> &norm_manager,
                     // preconditioner parameters
                     const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
                     bool is_y_zero,
                     const int max_num_sweeps,
                     const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
                     const int check_tol_every) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);

    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
    using node_memory_space = typename impl_type::node_memory_space;
    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using size_type = typename impl_type::size_type;
    using impl_scalar_type = typename impl_type::impl_scalar_type;
    using magnitude_type = typename impl_type::magnitude_type;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using vector_type_1d_view = typename impl_type::vector_type_1d_view;
    using vector_type_3d_view = typename impl_type::vector_type_3d_view;
    using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
    using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
5048 "Neither Tpetra importer nor Async importer is null.");
5051 "Maximum number of sweeps must be >= 1.");
    // const parameters
    const bool is_seq_method_requested = !tpetra_importer.is_null();
    const bool is_async_importer_active = !async_importer.is_null();
    const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
    const magnitude_type tolerance = tol*tol;
    const local_ordinal_type blocksize = btdm.values.extent(1);
    const local_ordinal_type num_vectors = Y.getNumVectors();
    const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;

    const impl_scalar_type zero(0.0);

    TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
                                "The seq method for applyInverseJacobi, " <<
                                "which in any case is for developer use only, " <<
                                "does not support norm-based termination.");
    const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
      Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
    TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
                               std::invalid_argument,
                               "The seq method for applyInverseJacobi, " <<
                               "which in any case is for developer use only, " <<
                               "only supports memory spaces accessible from host.");
    // if more workspace is needed, resize it
    const size_type work_span_required = num_blockrows*num_vectors*blocksize;
    if (work.span() < work_span_required)
      work = vector_type_1d_view("vector workspace 1d view", work_span_required);

    // construct W
    const local_ordinal_type W_size = interf.packptr.extent(0)-1;
    if (local_ordinal_type(W.extent(0)) < W_size)
      W = impl_scalar_type_1d_view("W", W_size);

    typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
    if (is_seq_method_requested) {
      if (Z.getNumVectors() != Y.getNumVectors())
        Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
    } else {
      if (is_async_importer_active) {
        // create comm data buffer and keep it here
        async_importer->createDataBuffer(num_vectors);
        remote_multivector = async_importer->getRemoteMultiVectorLocalView();
      }
    }

    // wrap the workspace with a block-packed 3d view
    vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
    const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
    const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
    const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
    if (is_y_zero) Kokkos::deep_copy(YY, zero);

    MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
    SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
                                             damping_factor, is_norm_manager_active);

    const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;

    auto A_crs  = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
    auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);

    bool hasBlockCrsMatrix = ! A_bcrs.is_null ();

    const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph());

    BlockHelperDetails::ComputeResidualVector<MatrixType>
      compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
                              is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
                              hasBlockCrsMatrix);

    // norm manager check frequency
    if (is_norm_manager_active)
      norm_manager.setCheckFrequency(check_tol_every);
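    // Sweep loop. Each sweep forms the residual-like right-hand side and then
    // solves the packed block tridiagonal systems, i.e. (up to the details of
    // the damping implementation) one damped block-Jacobi-of-lines iteration
    //
    //   y_{k+1} = y_k + omega * D^{-1} (x - A y_k)
    //           = (1 - omega) y_k + omega * D^{-1} (x - R y_k),  R = A - D,
    //
    // where D collects the block tridiagonal matrices of the lines and omega is
    // the damping factor. When a nonzero tolerance is given, W accumulates
    // per-line norms of the update so convergence can be checked every
    // 'check_tol_every' sweeps.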
    int sweep = 0;
    for (;sweep<max_num_sweeps;++sweep) {
      if (is_y_zero) {
        // pmv := x(lclrow)
        multivector_converter.run(XX);
      } else {
        if (is_seq_method_requested) {
          // y := x - R y
          Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
          compute_residual_vector.run(YY, XX, ZZ);
          // pmv := y(lclrow)
          multivector_converter.run(YY);
        } else {
          // fused y := x - R y and pmv := y(lclrow)
          if (overlap_communication_and_computation || !is_async_importer_active) {
            if (is_async_importer_active) async_importer->asyncSendRecv(YY);
            // compute owned rows while the halo exchange is in flight
            compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
            if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
              if (is_async_importer_active) async_importer->cancel();
              break;
            }
            if (is_async_importer_active) {
              async_importer->syncRecv();
              // fold in the remote (non-owned) contributions
              compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
            }
          } else {
            if (is_async_importer_active)
              async_importer->syncExchange(YY);
            if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
            compute_residual_vector.run(pmv, XX, YY, remote_multivector);
          }
        }
      }

      // solve the packed block tridiagonal systems and update y; W receives
      // per-line measures of the update when the norm manager is active
      solve_tridiags.run(YY, W);

      if (is_norm_manager_active) {
        BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
        if (sweep + 1 == max_num_sweeps) {
          norm_manager.ireduce(sweep, true);
          norm_manager.checkDone(sweep + 1, tolerance, true);
        } else {
          norm_manager.ireduce(sweep);
        }
      }

      is_y_zero = false;
    }

    if (is_norm_manager_active) norm_manager.finalize();

    return sweep;
  }
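  // The fused block Jacobi path below is a specialization for the case where
  // the block tridiagonal factors reduce to (inverted) diagonal blocks, stored
  // in btdm.d_inv. Residual computation, the block-diagonal solve, and the
  // damped update are fused into single kernels (the ComputeResidualAndSolve_*
  // functors), and the iterate is double buffered so a kernel can read the
  // previous sweep's y while writing the new one.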
  // Fused block Jacobi implementation for a fixed block size B (B == 0 handles
  // the general, runtime block size).
  template<typename MatrixType, int B>
  int
  applyFusedBlockJacobi_Impl(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
                             const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                             const bool overlap_communication_and_computation,
                             // tpetra interface
                             const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
                             typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
                             typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
                             // local object interface
                             const BlockHelperDetails::PartInterface<MatrixType> &interf,
                             const BlockTridiags<MatrixType> &btdm,
                             const BlockHelperDetails::AmD<MatrixType> &amd,
                             typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work,
                             BlockHelperDetails::NormManager<MatrixType> &norm_manager,
                             // preconditioner parameters
                             const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
                             bool is_y_zero,
                             const int max_num_sweeps,
                             const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
                             const int check_tol_every) {
    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
    using node_memory_space = typename impl_type::node_memory_space;
    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using size_type = typename impl_type::size_type;
    using impl_scalar_type = typename impl_type::impl_scalar_type;
    using magnitude_type = typename impl_type::magnitude_type;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
    using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
    using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
5237 "Neither Tpetra importer nor Async importer is null.");
5240 "Maximum number of sweeps must be >= 1.");
    // const parameters
    const bool is_async_importer_active = !async_importer.is_null();
    const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
    const magnitude_type tolerance = tol*tol;
    const local_ordinal_type blocksize = btdm.d_inv.extent(1);
    const local_ordinal_type num_vectors = Y.getNumVectors();
    const local_ordinal_type num_blockrows = interf.nparts;

    typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
    if (is_async_importer_active) {
      // create comm data buffer and keep it here
      async_importer->createDataBuffer(num_vectors);
      remote_multivector = async_importer->getRemoteMultiVectorLocalView();
    }

    const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
    const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);

    const bool two_pass_residual =
      overlap_communication_and_computation && is_async_importer_active;

    // check that the local dimensions of Y agree with the partitioning
    TEUCHOS_TEST_FOR_EXCEPT_MSG(
      size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1),
      "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) <<
      " = " << YY.extent(0) * YY.extent(1) << ",\n" <<
      "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors <<
      " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n');

    size_type work_required = size_type(num_blockrows) * blocksize * num_vectors;
    if (work.extent(0) < work_required) {
      work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required);
    }

    Unmanaged<impl_scalar_type_2d_view_tpetra> y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors);

    // construct W
    if (W.extent(0) != size_t(num_blockrows))
      W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows);

    // functors for the sweep variants
    BlockHelperDetails::ComputeResidualAndSolve_SolveOnly<MatrixType, B>
      functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor);
    BlockHelperDetails::ComputeResidualAndSolve_1Pass<MatrixType, B>
      functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor);
    BlockHelperDetails::ComputeResidualAndSolve_2Pass<MatrixType, B>
      functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor);

    // norm manager check frequency
    if (is_norm_manager_active)
      norm_manager.setCheckFrequency(check_tol_every);

    // Y double buffer (initially, YY holds the contents of Y)
    Unmanaged<impl_scalar_type_2d_view_tpetra> y_buffers[2] = {YY, y_doublebuf};
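    // Sweep loop. Two buffers alternate roles each sweep: y_buffers[current_y]
    // holds the previous iterate (read), y_buffers[1-current_y] receives the
    // new one (write). With overlapped communication and an active async
    // importer, the residual is computed in two passes: pass 1 handles locally
    // owned columns while the halo exchange is in flight, and pass 2 adds the
    // remote columns after syncRecv(). A rough sketch of the buffer rotation
    // (illustrative only):
    //
    //   int current_y = 0;
    //   for (int sweep = 0; sweep < max_num_sweeps; ++sweep) {
    //     update(/*read*/ buf[current_y], /*write*/ buf[1-current_y]);
    //     current_y = 1 - current_y;            // the new iterate becomes current
    //   }
    //   if (current_y == 1) copy_back(Y, buf[1]); // final result may live in scratch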
    int sweep = 0;
    int current_y = 0;
    for (;sweep < max_num_sweeps; ++sweep) {
      if (is_y_zero) {
        // y is initially zero, so no residual is needed: y := damping * inv(D) x
        functor_solve_only.run(XX, y_buffers[1-current_y]);
      } else {
        // y := y + damping * inv(D) (x - R y)
        if (overlap_communication_and_computation || !is_async_importer_active) {
          if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]);
          if (two_pass_residual) {
            // pass 1: owned-row contributions while the halo exchange is in flight
            functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1-current_y]);
          } else {
            // everything is locally available; residual and solve in one pass
            functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1-current_y]);
          }
          if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
            if (is_async_importer_active) async_importer->cancel();
            break;
          }
          if (is_async_importer_active) {
            async_importer->syncRecv();
            // pass 2: fold in the received remote contributions and finish the solve
            functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1-current_y]);
          }
        } else {
          if (is_async_importer_active)
            async_importer->syncExchange(y_buffers[current_y]);
          if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
          functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1-current_y]);
        }
      }

      if (is_norm_manager_active) {
        BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
        if (sweep + 1 == max_num_sweeps) {
          norm_manager.ireduce(sweep, true);
          norm_manager.checkDone(sweep + 1, tolerance, true);
        } else {
          norm_manager.ireduce(sweep);
        }
      }

      is_y_zero = false;
      current_y = 1 - current_y;
    }
    // if we ended on the double buffer, copy the final iterate back to Y
    if (current_y == 1) {
      Kokkos::deep_copy(YY, y_doublebuf);
    }

    if (is_norm_manager_active) norm_manager.finalize();

    return sweep;
  }
  ///
  /// top level apply interface for fused block Jacobi
  ///
  template<typename MatrixType>
  int
  applyFusedBlockJacobi(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
                        const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                        const bool overlap_communication_and_computation,
                        // tpetra interface
                        const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
                        typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
                        typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
                        // local object interface
                        const BlockHelperDetails::PartInterface<MatrixType> &interf,
                        const BlockTridiags<MatrixType> &btdm,
                        const BlockHelperDetails::AmD<MatrixType> &amd,
                        typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &work,
                        BlockHelperDetails::NormManager<MatrixType> &norm_manager,
                        // preconditioner parameters
                        const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
                        bool is_y_zero,
                        const int max_num_sweeps,
                        const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
                        const int check_tol_every) {
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi);
    int blocksize = btdm.d_inv.extent(1);
    int sweep = 0;
#define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) {                     \
      sweep = applyFusedBlockJacobi_Impl<MatrixType, B>(                \
        tpetra_importer, async_importer, overlap_communication_and_computation, \
        X, Y, W, interf, btdm, amd, work,                               \
        norm_manager, damping_factor, is_y_zero,                        \
        max_num_sweeps, tol, check_tol_every);                          \
    } break
    switch (blocksize) {
    case  3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 3);
    case  5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 5);
    case  7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 7);
    case  9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 9);
    case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10);
    case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11);
    case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16);
    case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17);
    case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18);
    default : BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 0);
    }
#undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI

    return sweep;
  }
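  // The ImplObject below gathers the state that BlockTriDiContainer keeps
  // between setup and apply: the partition interface, the packed block
  // tridiagonal factors, the off-diagonal part R = A - D used for residuals,
  // the importers used for the halo exchange, workspaces for the two apply
  // paths, and the norm manager used for tolerance-based termination.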
  template<typename MatrixType>
  struct ImplObject {
    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
    using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
    using block_tridiags_type = BlockTridiags<MatrixType>;
    using amd_type = BlockHelperDetails::AmD<MatrixType>;
    using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
    using async_import_type = AsyncableImport<MatrixType>;

    // distributed objects
    Teuchos::RCP<const typename impl_type::tpetra_row_matrix_type> A;
    Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
    Teuchos::RCP<async_import_type> async_importer;
    bool overlap_communication_and_computation;

    mutable typename impl_type::tpetra_multivector_type Z;
    mutable typename impl_type::impl_scalar_type_1d_view W;

    // local objects
    part_interface_type part_interface;
    block_tridiags_type block_tridiags; // D
    amd_type a_minus_d;                 // R = A - D
    bool use_fused_jacobi;

    mutable typename impl_type::vector_type_1d_view work;           // packed workspace (block tridiagonal path)
    mutable typename impl_type::impl_scalar_type_1d_view work_flat; // flat workspace (fused block Jacobi path)
    mutable norm_manager_type norm_manager;
  };