43 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
44 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
51 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
52 #include <Tpetra_Distributor.hpp>
53 #include <Tpetra_BlockMultiVector.hpp>
55 #include <Kokkos_ArithTraits.hpp>
56 #include <KokkosBatched_Util.hpp>
57 #include <KokkosBatched_Vector.hpp>
58 #include <KokkosBatched_Copy_Decl.hpp>
59 #include <KokkosBatched_Copy_Impl.hpp>
60 #include <KokkosBatched_AddRadial_Decl.hpp>
61 #include <KokkosBatched_AddRadial_Impl.hpp>
62 #include <KokkosBatched_SetIdentity_Decl.hpp>
63 #include <KokkosBatched_SetIdentity_Impl.hpp>
64 #include <KokkosBatched_Gemm_Decl.hpp>
65 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
66 #include <KokkosBatched_Gemm_Team_Impl.hpp>
67 #include <KokkosBatched_Gemv_Decl.hpp>
68 #include <KokkosBatched_Gemv_Team_Impl.hpp>
69 #include <KokkosBatched_Trsm_Decl.hpp>
70 #include <KokkosBatched_Trsm_Serial_Impl.hpp>
71 #include <KokkosBatched_Trsm_Team_Impl.hpp>
72 #include <KokkosBatched_Trsv_Decl.hpp>
73 #include <KokkosBatched_Trsv_Serial_Impl.hpp>
74 #include <KokkosBatched_Trsv_Team_Impl.hpp>
75 #include <KokkosBatched_LU_Decl.hpp>
76 #include <KokkosBatched_LU_Serial_Impl.hpp>
77 #include <KokkosBatched_LU_Team_Impl.hpp>
79 #include <KokkosBlas1_nrm1.hpp>
80 #include <KokkosBlas1_nrm2.hpp>
84 #include "Ifpack2_BlockHelper.hpp"
85 #include "Ifpack2_BlockComputeResidualVector.hpp"
92 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
93 #include "cuda_profiler_api.h"
98 #define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
106 #define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
110 #define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
113 #if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
114 #define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
118 #define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
122 namespace BlockTriDiContainerDetails {
124 namespace KB = KokkosBatched;
131 template <
typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
132 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
133 MemoryTraitsType::is_random_access |
136 template <
typename ViewType>
137 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
138 typename ViewType::array_layout,
139 typename ViewType::device_type,
140 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
141 template <
typename ViewType>
142 using Atomic = Kokkos::View<
typename ViewType::data_type,
143 typename ViewType::array_layout,
144 typename ViewType::device_type,
145 MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
146 template <
typename ViewType>
147 using Const = Kokkos::View<
typename ViewType::const_data_type,
148 typename ViewType::array_layout,
149 typename ViewType::device_type,
150 typename ViewType::memory_traits>;
151 template <
typename ViewType>
152 using ConstUnmanaged = Const<Unmanaged<ViewType> >;
154 template <
typename ViewType>
155 using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
157 template <
typename ViewType>
158 using Unmanaged = Kokkos::View<
typename ViewType::data_type,
159 typename ViewType::array_layout,
160 typename ViewType::device_type,
161 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
164 template <
typename ViewType>
165 using Scratch = Kokkos::View<
typename ViewType::data_type,
166 typename ViewType::array_layout,
167 typename ViewType::execution_space::scratch_memory_space,
168 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
174 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
179 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
180 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
181 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
183 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
184 { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
186 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
188 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
194 template<
typename MatrixType>
197 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::CreateBlockCrsTpetraImporter");
199 using tpetra_map_type =
typename impl_type::tpetra_map_type;
200 using tpetra_mv_type =
typename impl_type::tpetra_block_multivector_type;
201 using tpetra_import_type =
typename impl_type::tpetra_import_type;
203 const auto g = A->getCrsGraph();
204 const auto blocksize = A->getBlockSize();
205 const auto src =
Teuchos::rcp(
new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
206 const auto tgt =
Teuchos::rcp(
new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
207 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
216 template<
typename MatrixType>
217 struct AsyncableImport {
225 #if !defined(HAVE_IFPACK2_MPI)
226 typedef int MPI_Request;
227 typedef int MPI_Comm;
229 using scalar_type =
typename impl_type::scalar_type;
233 static int isend(
const MPI_Comm comm,
const char* buf,
int count,
int dest,
int tag, MPI_Request* ireq) {
234 #ifdef HAVE_IFPACK2_MPI
236 int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
237 if (ireq == NULL) MPI_Request_free(&ureq);
244 static int irecv(
const MPI_Comm comm,
char* buf,
int count,
int src,
int tag, MPI_Request* ireq) {
245 #ifdef HAVE_IFPACK2_MPI
247 int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
248 if (ireq == NULL) MPI_Request_free(&ureq);
255 static int waitany(
int count, MPI_Request* reqs,
int* index) {
256 #ifdef HAVE_IFPACK2_MPI
257 return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
263 static int waitall(
int count, MPI_Request* reqs) {
264 #ifdef HAVE_IFPACK2_MPI
265 return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
272 using tpetra_map_type =
typename impl_type::tpetra_map_type;
273 using tpetra_import_type =
typename impl_type::tpetra_import_type;
275 using local_ordinal_type =
typename impl_type::local_ordinal_type;
276 using global_ordinal_type =
typename impl_type::global_ordinal_type;
280 using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
281 using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
283 using execution_space =
typename impl_type::execution_space;
284 using memory_space =
typename impl_type::memory_space;
285 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
287 using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
289 #if defined(KOKKOS_ENABLE_CUDA)
290 using impl_scalar_type_1d_view =
291 typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
292 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
293 Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
294 # elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
295 Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
296 # else // no experimental macros are defined
297 typename impl_type::impl_scalar_type_1d_view,
299 typename impl_type::impl_scalar_type_1d_view>::type;
301 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
303 using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type*,Kokkos::HostSpace>;
304 using impl_scalar_type_2d_view =
typename impl_type::impl_scalar_type_2d_view;
305 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
307 #ifdef HAVE_IFPACK2_MPI
311 impl_scalar_type_2d_view_tpetra remote_multivector;
312 local_ordinal_type blocksize;
315 struct SendRecvPair {
320 SendRecvPair<int_1d_view_host> pids;
321 SendRecvPair<std::vector<MPI_Request> > reqs;
322 SendRecvPair<size_type_1d_view> offset;
323 SendRecvPair<size_type_1d_view_host> offset_host;
324 SendRecvPair<local_ordinal_type_1d_view> lids;
325 SendRecvPair<impl_scalar_type_1d_view> buffer;
326 SendRecvPair<impl_scalar_type_1d_view_host> buffer_host;
328 local_ordinal_type_1d_view dm2cm;
330 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
331 using exec_instance_1d_std_vector = std::vector<execution_space>;
332 exec_instance_1d_std_vector exec_instances;
338 const size_type_1d_view &offs) {
340 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.
getRawPtr()), lens.
size());
341 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
344 const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
345 const local_ordinal_type lens_size = lens_device.extent(0);
346 Kokkos::parallel_scan
347 (
"AsyncableImport::RangePolicy::setOffsetValues",
348 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
351 update += (i < lens_size ? lens_device[i] : 0);
356 const size_type_1d_view_host &offs) {
358 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.
getRawPtr()), lens.
size());
359 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
363 for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
364 offs(i) = offs(i-1) + lens[i-1];
369 void createMpiRequests(
const tpetra_import_type &
import) {
370 Tpetra::Distributor &distributor =
import.getDistributor();
373 const auto pids_from = distributor.getProcsFrom();
375 memcpy(pids.recv.data(), pids_from.getRawPtr(),
sizeof(int)*pids.recv.extent(0));
377 const auto pids_to = distributor.getProcsTo();
379 memcpy(pids.send.data(), pids_to.getRawPtr(),
sizeof(int)*pids.send.extent(0));
382 reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*
sizeof(MPI_Request));
383 reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*
sizeof(MPI_Request));
387 const auto lengths_to = distributor.getLengthsTo();
390 const auto lengths_from = distributor.getLengthsFrom();
393 setOffsetValues(lengths_to, offset.send);
394 offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
396 setOffsetValues(lengths_from, offset.recv);
397 offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
399 const auto lengths_to = distributor.getLengthsTo();
400 offset_host.send = size_type_1d_view_host(
do_not_initialize_tag(
"offset send"), lengths_to.size() + 1);
402 const auto lengths_from = distributor.getLengthsFrom();
403 offset_host.recv = size_type_1d_view_host(
do_not_initialize_tag(
"offset recv"), lengths_from.size() + 1);
405 setOffsetValuesHost(lengths_to, offset_host.send);
408 setOffsetValuesHost(lengths_from, offset_host.recv);
413 void createSendRecvIDs(
const tpetra_import_type &
import) {
415 const auto remote_lids =
import.getRemoteLIDs();
416 const local_ordinal_type_1d_view_host
417 remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
419 Kokkos::deep_copy(lids.recv, remote_lids_view_host);
422 auto epids =
import.getExportPIDs();
423 auto elids =
import.getExportLIDs();
426 auto lids_send_host = Kokkos::create_mirror_view(lids.send);
429 for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
430 const auto pid_send_value = pids.send[i];
431 for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
432 if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
433 TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
435 Kokkos::deep_copy(lids.send, lids_send_host);
438 void createExecutionSpaceInstances() {
439 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
442 Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
449 struct ToMultiVector {};
453 const local_ordinal_type blocksize_,
454 const local_ordinal_type_1d_view dm2cm_) {
455 blocksize = blocksize_;
458 #ifdef HAVE_IFPACK2_MPI
459 comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
461 const tpetra_import_type
import(src_map, tgt_map);
463 createMpiRequests(
import);
464 createSendRecvIDs(
import);
465 createExecutionSpaceInstances();
468 void createDataBuffer(
const local_ordinal_type &num_vectors) {
469 const size_type extent_0 = lids.recv.extent(0)*blocksize;
470 const size_type extent_1 = num_vectors;
471 if (remote_multivector.extent(0) == extent_0 &&
472 remote_multivector.extent(1) == extent_1) {
478 const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
479 const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
484 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
485 buffer_host.send = impl_scalar_type_1d_view_host(
do_not_initialize_tag(
"buffer send"), send_buffer_size);
486 buffer_host.recv = impl_scalar_type_1d_view_host(
do_not_initialize_tag(
"buffer recv"), recv_buffer_size);
492 #ifdef HAVE_IFPACK2_MPI
493 waitall(reqs.recv.size(), reqs.recv.data());
494 waitall(reqs.send.size(), reqs.send.data());
502 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
503 template<
typename PackTag>
505 void copy(
const local_ordinal_type_1d_view &lids_,
506 const impl_scalar_type_1d_view &buffer_,
507 const local_ordinal_type ibeg_,
508 const local_ordinal_type iend_,
509 const impl_scalar_type_2d_view_tpetra &multivector_,
510 const local_ordinal_type blocksize_,
511 const execution_space &exec_instance_) {
512 const local_ordinal_type num_vectors = multivector_.extent(1);
513 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
514 const local_ordinal_type idiff = iend_ - ibeg_;
515 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
517 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
518 local_ordinal_type vector_size(0);
519 if (blocksize_ <= 4) vector_size = 4;
520 else if (blocksize_ <= 8) vector_size = 8;
521 else if (blocksize_ <= 16) vector_size = 16;
522 else vector_size = 32;
524 const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
525 const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
528 Kokkos::Experimental::require(policy, work_item_property),
529 KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
530 const local_ordinal_type i = member.league_rank();
532 (Kokkos::TeamThreadRange(member,num_vectors),[&](
const local_ordinal_type &j) {
533 auto aptr = abase + blocksize_*(i + idiff*j);
534 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
535 if (std::is_same<PackTag,ToBuffer>::value)
537 (Kokkos::ThreadVectorRange(member,blocksize_),[&](
const local_ordinal_type &k) {
542 (Kokkos::ThreadVectorRange(member,blocksize_),[&](
const local_ordinal_type &k) {
549 void asyncSendRecvVar1(
const impl_scalar_type_2d_view_tpetra &mv) {
550 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::AsyncSendRecv");
552 #ifdef HAVE_IFPACK2_MPI
554 const local_ordinal_type num_vectors = mv.extent(1);
555 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
558 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
559 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
561 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
562 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*
sizeof(impl_scalar_type),
569 reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
570 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*
sizeof(impl_scalar_type),
578 execution_space().fence();
581 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
583 if (i<8) exec_instances[i%8].fence();
584 copy<ToBuffer>(lids.send, buffer.send,
585 offset_host.send(i), offset_host.send(i+1),
588 exec_instances[i%8]);
589 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
591 const local_ordinal_type num_vectors = mv.extent(1);
592 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
594 Kokkos::deep_copy(exec_instances[i%8],
595 Kokkos::subview(buffer_host.send,
596 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
597 offset_host.send(i)*mv_blocksize,
598 offset_host.send(i+1)*mv_blocksize)),
599 Kokkos::subview(buffer.send,
600 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
601 offset_host.send(i)*mv_blocksize,
602 offset_host.send(i+1)*mv_blocksize)));
607 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
609 if (i<8) exec_instances[i%8].fence();
610 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
612 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
613 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*
sizeof(impl_scalar_type),
620 reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
621 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*
sizeof(impl_scalar_type),
629 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
632 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
634 #endif // HAVE_IFPACK2_MPI
635 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
638 void syncRecvVar1() {
639 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncRecv");
640 #ifdef HAVE_IFPACK2_MPI
642 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
643 local_ordinal_type idx = i;
646 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
648 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
649 const local_ordinal_type num_vectors = remote_multivector.extent(1);
650 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
653 Kokkos::subview(buffer.recv,
654 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
655 offset_host.recv(idx)*mv_blocksize,
656 offset_host.recv(idx+1)*mv_blocksize)),
657 Kokkos::subview(buffer_host.recv,
658 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
659 offset_host.recv(idx)*mv_blocksize,
660 offset_host.recv(idx+1)*mv_blocksize)));
664 copy<ToMultiVector>(lids.recv, buffer.recv,
665 offset_host.recv(idx), offset_host.recv(idx+1),
666 remote_multivector, blocksize,
667 exec_instances[idx%8]);
674 waitall(reqs.send.size(), reqs.send.data());
675 #endif // HAVE_IFPACK2_MPI
676 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
678 #endif //defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
685 template<
typename PackTag>
687 void copy(
const local_ordinal_type_1d_view &lids_,
688 const impl_scalar_type_1d_view &buffer_,
689 const local_ordinal_type &ibeg_,
690 const local_ordinal_type &iend_,
691 const impl_scalar_type_2d_view_tpetra &multivector_,
692 const local_ordinal_type blocksize_) {
693 const local_ordinal_type num_vectors = multivector_.extent(1);
694 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
695 const local_ordinal_type idiff = iend_ - ibeg_;
696 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
697 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
698 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
699 local_ordinal_type vector_size(0);
700 if (blocksize_ <= 4) vector_size = 4;
701 else if (blocksize_ <= 8) vector_size = 8;
702 else if (blocksize_ <= 16) vector_size = 16;
703 else vector_size = 32;
704 const team_policy_type policy(idiff, 1, vector_size);
706 (
"AsyncableImport::TeamPolicy::copy",
707 policy, KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
708 const local_ordinal_type i = member.league_rank();
710 (Kokkos::TeamThreadRange(member,num_vectors),[&](
const local_ordinal_type &j) {
711 auto aptr = abase + blocksize_*(i + idiff*j);
712 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
713 if (std::is_same<PackTag,ToBuffer>::value)
715 (Kokkos::ThreadVectorRange(member,blocksize_),[&](
const local_ordinal_type &k) {
720 (Kokkos::ThreadVectorRange(member,blocksize_),[&](
const local_ordinal_type &k) {
726 const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
728 (
"AsyncableImport::RangePolicy::copy",
729 policy, KOKKOS_LAMBDA(
const local_ordinal_type &ij) {
730 const local_ordinal_type i = ij%idiff;
731 const local_ordinal_type j = ij/idiff;
732 auto aptr = abase + blocksize_*(i + idiff*j);
733 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
734 auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
735 auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
736 memcpy(to, from,
sizeof(impl_scalar_type)*blocksize_);
745 void asyncSendRecvVar0(
const impl_scalar_type_2d_view_tpetra &mv) {
746 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::AsyncSendRecv");
748 #ifdef HAVE_IFPACK2_MPI
750 const local_ordinal_type num_vectors = mv.extent(1);
751 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
754 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
755 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
757 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
758 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*
sizeof(impl_scalar_type),
765 reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
766 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*
sizeof(impl_scalar_type),
774 for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
775 copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
778 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
780 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
781 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*
sizeof(impl_scalar_type),
788 Kokkos::subview(buffer_host.send,
789 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
790 offset_host.send(i)*mv_blocksize,
791 offset_host.send(i+1)*mv_blocksize)),
792 Kokkos::subview(buffer.send,
793 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
794 offset_host.send(i)*mv_blocksize,
795 offset_host.send(i+1)*mv_blocksize)));
797 reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
798 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*
sizeof(impl_scalar_type),
807 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
810 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
813 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
816 void syncRecvVar0() {
817 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncRecv");
818 #ifdef HAVE_IFPACK2_MPI
820 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
821 local_ordinal_type idx = i;
822 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
823 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
824 const local_ordinal_type num_vectors = remote_multivector.extent(1);
825 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
827 Kokkos::subview(buffer.recv,
828 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
829 offset_host.recv(idx)*mv_blocksize,
830 offset_host.recv(idx+1)*mv_blocksize)),
831 Kokkos::subview(buffer_host.recv,
832 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
833 offset_host.recv(idx)*mv_blocksize,
834 offset_host.recv(idx+1)*mv_blocksize)));
836 copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
837 remote_multivector, blocksize);
840 waitall(reqs.send.size(), reqs.send.data());
842 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
848 void asyncSendRecv(
const impl_scalar_type_2d_view_tpetra &mv) {
849 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
850 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
851 asyncSendRecvVar1(mv);
853 asyncSendRecvVar0(mv);
856 asyncSendRecvVar0(mv);
860 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
861 #if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
871 void syncExchange(
const impl_scalar_type_2d_view_tpetra &mv) {
872 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::AsyncableImport::SyncExchange");
875 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
878 impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView()
const {
return remote_multivector; }
884 template<
typename MatrixType>
888 using tpetra_map_type =
typename impl_type::tpetra_map_type;
889 using local_ordinal_type =
typename impl_type::local_ordinal_type;
890 using global_ordinal_type =
typename impl_type::global_ordinal_type;
891 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
893 const auto g = A->getCrsGraph();
894 const auto blocksize = A->getBlockSize();
895 const auto domain_map = g.getDomainMap();
896 const auto column_map = g.getColMap();
898 std::vector<global_ordinal_type> gids;
899 bool separate_remotes =
true, found_first =
false, need_owned_permutation =
false;
900 for (
size_t i=0;i<column_map->getLocalNumElements();++i) {
901 const global_ordinal_type gid = column_map->getGlobalElement(i);
902 if (!domain_map->isNodeGlobalElement(gid)) {
905 }
else if (found_first) {
906 separate_remotes =
false;
909 if (!need_owned_permutation &&
910 domain_map->getLocalElement(gid) !=
static_cast<local_ordinal_type
>(i)) {
919 need_owned_permutation =
true;
923 if (separate_remotes) {
925 const auto parsimonious_col_map
926 =
Teuchos::rcp(
new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
927 if (parsimonious_col_map->getGlobalNumElements() > 0) {
929 local_ordinal_type_1d_view dm2cm;
930 if (need_owned_permutation) {
931 dm2cm = local_ordinal_type_1d_view(
do_not_initialize_tag(
"dm2cm"), domain_map->getLocalNumElements());
932 const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
933 for (
size_t i=0;i<domain_map->getLocalNumElements();++i)
934 dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
935 Kokkos::deep_copy(dm2cm, dm2cm_host);
937 return Teuchos::rcp(
new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
940 return Teuchos::null;
943 template<
typename local_ordinal_type>
944 local_ordinal_type costTRSM(
const local_ordinal_type block_size) {
945 return block_size*block_size;
948 template<
typename local_ordinal_type>
949 local_ordinal_type costGEMV(
const local_ordinal_type block_size) {
950 return 2*block_size*block_size;
953 template<
typename local_ordinal_type>
954 local_ordinal_type costTriDiagSolve(
const local_ordinal_type subline_length,
const local_ordinal_type block_size) {
955 return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length-1) * costGEMV(block_size);
958 template<
typename local_ordinal_type>
959 local_ordinal_type costSolveSchur(
const local_ordinal_type num_parts,
960 const local_ordinal_type num_teams,
961 const local_ordinal_type line_length,
962 const local_ordinal_type block_size,
963 const local_ordinal_type n_subparts_per_part) {
964 const local_ordinal_type subline_length = ceil(
double(line_length - (n_subparts_per_part-1) * 2) / n_subparts_per_part);
965 if (subline_length < 1) {
969 const local_ordinal_type p_n_lines = ceil(
double(num_parts)/num_teams);
970 const local_ordinal_type p_n_sublines = ceil(
double(n_subparts_per_part)*num_parts/num_teams);
971 const local_ordinal_type p_n_sublines_2 = ceil(
double(n_subparts_per_part-1)*num_parts/num_teams);
973 const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
974 const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part-1)*2,block_size);
975 const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length,block_size);
976 const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
978 if (n_subparts_per_part == 1) {
979 return p_costApplyAinv;
981 return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
984 template<
typename local_ordinal_type>
985 local_ordinal_type getAutomaticNSubparts(
const local_ordinal_type num_parts,
986 const local_ordinal_type num_teams,
987 const local_ordinal_type line_length,
988 const local_ordinal_type block_size) {
989 local_ordinal_type n_subparts_per_part_0 = 1;
990 local_ordinal_type flop_0 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0);
991 local_ordinal_type flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0+1);
992 while (flop_0 > flop_1) {
994 flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, (++n_subparts_per_part_0)+1);
996 return n_subparts_per_part_0;
999 template<
typename ArgActiveExecutionMemorySpace>
1000 struct SolveTridiagsDefaultModeAndAlgo;
1005 template<
typename MatrixType>
1006 BlockHelperDetails::PartInterface<MatrixType>
1009 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1010 IFPACK2_BLOCKHELPER_TIMER(
"createPartInterface");
1012 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1013 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1014 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
1015 using size_type =
typename impl_type::size_type;
1017 const auto blocksize = A->getBlockSize();
1018 constexpr
int vector_length = impl_type::vector_length;
1019 constexpr
int internal_vector_length = impl_type::internal_vector_length;
1021 const auto comm = A->getRowMap()->getComm();
1023 BlockHelperDetails::PartInterface<MatrixType> interf;
1025 const bool jacobi = partitions.size() == 0;
1026 const local_ordinal_type A_n_lclrows = A->getLocalNumRows();
1027 const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1029 typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1030 std::vector<size_idx_pair_type> partsz(nparts);
1033 for (local_ordinal_type i=0;i<nparts;++i)
1034 partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1035 std::sort(partsz.begin(), partsz.end(),
1036 [] (
const size_idx_pair_type& x,
const size_idx_pair_type& y) {
1037 return x.first > y.first;
1041 local_ordinal_type n_subparts_per_part;
1042 if (n_subparts_per_part_in == -1) {
1045 using execution_space =
typename impl_type::execution_space;
1047 const int line_length = partsz[0].first;
1049 const local_ordinal_type team_size =
1050 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1051 recommended_team_size(blocksize, vector_length, internal_vector_length);
1053 const local_ordinal_type num_teams = execution_space().concurrency() / (team_size * vector_length);
1055 n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1057 printf(
"Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1060 n_subparts_per_part = n_subparts_per_part_in;
1064 const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1067 const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1);
1069 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1070 local_ordinal_type nrows = 0;
1074 for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1077 (nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) <<
"The #rows implied by the local partition is not "
1078 <<
"the same as getLocalNumRows: " << nrows <<
" vs " << A_n_lclrows);
1082 std::vector<local_ordinal_type> p;
1084 interf.max_partsz = 1;
1085 interf.max_subpartsz = 0;
1086 interf.n_subparts_per_part = 1;
1087 interf.nparts = nparts;
1092 for (local_ordinal_type i=0;i<nparts;++i)
1093 p[i] = partsz[i].second;
1095 interf.max_partsz = partsz[0].first;
1097 constexpr local_ordinal_type connection_length = 2;
1098 const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1099 const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1101 interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1102 interf.n_subparts_per_part = n_subparts_per_part;
1103 interf.nparts = nparts;
1109 interf.part2rowidx0 = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2rowidx0"), nparts + 1);
1110 interf.part2packrowidx0 = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2packrowidx0"), nparts + 1);
1113 interf.part2rowidx0_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1114 interf.part2packrowidx0_sub = local_ordinal_type_2d_view(
do_not_initialize_tag(
"part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1115 interf.rowidx2part_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"rowidx2part"), A_n_lclrows);
1117 interf.partptr_sub = local_ordinal_type_2d_view(
do_not_initialize_tag(
"partptr_sub"), n_sub_parts_and_schur, 2);
1120 const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1121 const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1123 const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1124 const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1125 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1126 const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1128 const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1129 const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1130 const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1133 interf.row_contiguous =
true;
1135 part2rowidx0(0) = 0;
1136 part2packrowidx0(0) = 0;
1137 local_ordinal_type pack_nrows = 0;
1138 local_ordinal_type pack_nrows_sub = 0;
1140 IFPACK2_BLOCKHELPER_TIMER(
"compute part indices (Jacobi)");
1141 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1142 constexpr local_ordinal_type ipnrows = 1;
1144 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1147 if (ip % vector_length == 0) pack_nrows = ipnrows;
1148 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1149 const local_ordinal_type offset = partptr(ip);
1150 for (local_ordinal_type i=0;i<ipnrows;++i) {
1151 const auto lcl_row = ip;
1153 BlockHelperDetails::get_msg_prefix(comm)
1154 <<
"partitions[" << p[ip] <<
"]["
1155 << i <<
"] = " << lcl_row
1156 <<
" but input matrix implies limits of [0, " << A_n_lclrows-1
1158 lclrow(offset+i) = lcl_row;
1159 rowidx2part(offset+i) = ip;
1160 if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1161 interf.row_contiguous =
false;
1163 partptr(ip+1) = offset + ipnrows;
1165 part2rowidx0_sub(0) = 0;
1166 partptr_sub(0, 0) = 0;
1168 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1169 constexpr local_ordinal_type ipnrows = 1;
1170 const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1173 (full_line_length != ipnrows, std::logic_error,
1174 "In the part " << ip );
1176 constexpr local_ordinal_type connection_length = 2;
1178 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1180 (
true, std::logic_error,
1181 "The part " << ip <<
" is too short to use " << n_subparts_per_part <<
" sub parts.");
1183 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1184 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1186 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1188 for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1189 const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1190 const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1191 if (local_sub_ip != n_subparts_per_part-1) {
1192 if (local_sub_ip != 0) {
1193 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1196 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1198 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1199 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1200 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1202 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1203 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1205 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1206 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1207 printf(
"Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1211 if (local_sub_ip != 0) {
1212 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1215 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1217 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1219 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1221 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1222 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1228 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1229 std::cout <<
"partptr_sub = " << std::endl;
1230 for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1231 for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1232 std::cout << partptr_sub(i,j) <<
" ";
1234 std::cout << std::endl;
1236 std::cout <<
"partptr_sub end" << std::endl;
1240 local_ordinal_type npacks = ceil(
float(nparts)/vector_length);
1242 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1243 for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1244 part2packrowidx0_sub(ip, 0) = 0;
1246 for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1248 local_ordinal_type ip_min = ipack*vector_length;
1249 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1250 for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1251 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1255 for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1256 local_ordinal_type ip_min = ipack*vector_length;
1257 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1259 const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1261 constexpr local_ordinal_type connection_length = 2;
1263 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1264 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1266 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1267 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1268 if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1270 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1272 for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1273 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1278 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1280 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1282 IFPACK2_BLOCKHELPER_TIMER(
"compute part indices");
1283 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1284 const auto* part = &partitions[p[ip]];
1285 const local_ordinal_type ipnrows = part->size();
1286 TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1288 BlockHelperDetails::get_msg_prefix(comm)
1289 <<
"partition " << p[ip]
1290 <<
" is empty, which is not allowed.");
1292 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1295 if (ip % vector_length == 0) pack_nrows = ipnrows;
1296 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1297 const local_ordinal_type offset = partptr(ip);
1298 for (local_ordinal_type i=0;i<ipnrows;++i) {
1299 const auto lcl_row = (*part)[i];
1301 BlockHelperDetails::get_msg_prefix(comm)
1302 <<
"partitions[" << p[ip] <<
"]["
1303 << i <<
"] = " << lcl_row
1304 <<
" but input matrix implies limits of [0, " << A_n_lclrows-1
1306 lclrow(offset+i) = lcl_row;
1307 rowidx2part(offset+i) = ip;
1308 if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1309 interf.row_contiguous =
false;
1311 partptr(ip+1) = offset + ipnrows;
1313 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1314 printf(
"Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1315 printf(
"partptr(%d+1) = %d\n", ip, partptr(ip+1));
1319 part2rowidx0_sub(0) = 0;
1320 partptr_sub(0, 0) = 0;
1323 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1324 const auto* part = &partitions[p[ip]];
1325 const local_ordinal_type ipnrows = part->size();
1326 const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1329 (full_line_length != ipnrows, std::logic_error,
1330 "In the part " << ip );
1332 constexpr local_ordinal_type connection_length = 2;
1334 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1336 (
true, std::logic_error,
1337 "The part " << ip <<
" is too short to use " << n_subparts_per_part <<
" sub parts.");
1339 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1340 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1342 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1344 for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1345 const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1346 const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1347 if (local_sub_ip != n_subparts_per_part-1) {
1348 if (local_sub_ip != 0) {
1349 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1352 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1354 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1355 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1356 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1358 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1359 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1361 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1362 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1363 printf(
"Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1367 if (local_sub_ip != 0) {
1368 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1371 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1373 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1375 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1377 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1378 printf(
"Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1385 local_ordinal_type npacks = ceil(
float(nparts)/vector_length);
1387 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1388 for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1389 part2packrowidx0_sub(ip, 0) = 0;
1391 for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1393 local_ordinal_type ip_min = ipack*vector_length;
1394 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1395 for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1396 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1400 for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1401 local_ordinal_type ip_min = ipack*vector_length;
1402 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1404 const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1406 constexpr local_ordinal_type connection_length = 2;
1408 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1409 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1411 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1412 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1413 if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1415 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1417 for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1418 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1423 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1425 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1427 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1430 if (lclrow(0) != 0) interf.row_contiguous =
false;
1432 Kokkos::deep_copy(interf.partptr, partptr);
1433 Kokkos::deep_copy(interf.lclrow, lclrow);
1435 Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1438 interf.part2rowidx0 = interf.partptr;
1439 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1441 interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1442 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1445 IFPACK2_BLOCKHELPER_TIMER(
"Fill packptr");
1446 local_ordinal_type npacks = ceil(
float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1448 for (local_ordinal_type ip=1;ip<=nparts;++ip)
1449 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1453 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1455 for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1456 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1459 Kokkos::deep_copy(interf.packptr, packptr);
1461 local_ordinal_type npacks_per_subpart = ceil(
float(nparts)/vector_length);
1462 npacks = ceil(
float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1464 interf.packindices_sub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"packindices_sub"), npacks_per_subpart*n_subparts_per_part);
1465 interf.packindices_schur = local_ordinal_type_2d_view(
do_not_initialize_tag(
"packindices_schur"), npacks_per_subpart,n_subparts_per_part-1);
1467 const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1468 const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1472 for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part-1;++local_sub_ip) {
1473 for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1474 packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1475 packindices_schur(local_pack_ip,local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1479 for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1480 packindices_sub((n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip;
1483 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1484 std::cout <<
"packindices_sub = " << std::endl;
1485 for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1486 std::cout << packindices_sub(i) <<
" ";
1488 std::cout << std::endl;
1489 std::cout <<
"packindices_sub end" << std::endl;
1491 std::cout <<
"packindices_schur = " << std::endl;
1492 for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1493 for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1494 std::cout << packindices_schur(i,j) <<
" ";
1496 std::cout << std::endl;
1499 std::cout <<
"packindices_schur end" << std::endl;
1502 Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1503 Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1506 const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1508 for (local_ordinal_type k=0;k<npacks + 1;++k)
1509 packptr_sub(k) = packptr(k%npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1511 Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1512 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1514 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1522 template <
typename MatrixType>
1525 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1527 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1528 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
1529 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
1535 size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1538 local_ordinal_type_1d_view A_colindsub;
1541 vector_type_3d_view values;
1544 vector_type_3d_view values_schur;
1546 vector_type_4d_view e_values;
1548 bool is_diagonal_only;
1554 template <
typename idx_type>
1555 static KOKKOS_FORCEINLINE_FUNCTION
1556 idx_type IndexToRow (
const idx_type& ind) {
return (ind + 1) / 3; }
1559 template <
typename idx_type>
1560 static KOKKOS_FORCEINLINE_FUNCTION
1561 idx_type RowToIndex (
const idx_type& row) {
return row > 0 ? 3*row - 1 : 0; }
1563 template <
typename idx_type>
1564 static KOKKOS_FORCEINLINE_FUNCTION
1565 idx_type NumBlocks (
const idx_type& nrows) {
return nrows > 0 ? 3*nrows - 2 : 0; }
1567 template <
typename idx_type>
1568 static KOKKOS_FORCEINLINE_FUNCTION
1569 idx_type NumBlocksSchur (
const idx_type& nrows) {
return nrows > 0 ? 3*nrows + 2 : 0; }
1576 template<
typename MatrixType>
1579 IFPACK2_BLOCKHELPER_TIMER(
"createBlockTridiags");
1581 using execution_space =
typename impl_type::execution_space;
1582 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1583 using size_type =
typename impl_type::size_type;
1584 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1586 constexpr
int vector_length = impl_type::vector_length;
1590 const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1593 btdm.flat_td_ptr = size_type_2d_view(
do_not_initialize_tag(
"btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1594 const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part );
1595 Kokkos::parallel_scan
1596 (
"createBlockTridiags::RangePolicy::flat_td_ptr",
1597 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i, size_type &update,
const bool &
final) {
1598 const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part);
1599 const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1602 btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1604 if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) {
1605 const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0);
1606 if (local_subpartidx % 2 == 0)
1607 update += btdm.NumBlocks(nrows);
1609 update += btdm.NumBlocksSchur(nrows);
1613 const auto nblocks = Kokkos::create_mirror_view_and_copy
1614 (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1));
1615 btdm.is_diagonal_only = (
static_cast<local_ordinal_type
>(nblocks()) == ntridiags);
1619 if (vector_length == 1) {
1620 btdm.pack_td_ptr = btdm.flat_td_ptr;
1624 local_ordinal_type npacks_per_subpart = 0;
1625 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1626 Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1627 for (local_ordinal_type ip=1;ip<=interf.nparts;++ip)
1628 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1629 ++npacks_per_subpart;
1631 btdm.pack_td_ptr = size_type_2d_view(
do_not_initialize_tag(
"btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1632 const Kokkos::RangePolicy<execution_space> policy(0,npacks_per_subpart);
1634 Kokkos::parallel_for
1635 (
"createBlockTridiags::RangePolicy::pack_td_ptr",
1636 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i) {
1637 for (local_ordinal_type j = 0; j < 2*interf.n_subparts_per_part; ++j) {
1638 const local_ordinal_type pack_id = ( j == 2*interf.n_subparts_per_part-1 ) ? i+(j-1)*npacks_per_subpart : i+j*npacks_per_subpart;
1639 const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id+1) - interf.packptr_sub(pack_id);
1641 const local_ordinal_type parti = interf.packptr_sub(pack_id);
1642 const local_ordinal_type partidx = parti%interf.nparts;
1644 for (local_ordinal_type pti=0;pti<nparts_in_pack;++pti) {
1645 btdm.pack_td_ptr(partidx+pti, j) = btdm.flat_td_ptr(i, j);
1651 btdm.pack_td_ptr_schur = size_type_2d_view(
do_not_initialize_tag(
"btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1653 const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1654 constexpr local_ordinal_type connection_length = 2;
1656 host_pack_td_ptr_schur(0,0) = 0;
1657 for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1658 if (i % vector_length == 0) {
1660 host_pack_td_ptr_schur(i,0) = host_pack_td_ptr_schur(i-1,host_pack_td_ptr_schur.extent(1)-1);
1661 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part-1; ++j) {
1662 host_pack_td_ptr_schur(i,j+1) = host_pack_td_ptr_schur(i,j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part-2 ? 1 : 0);
1666 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1667 host_pack_td_ptr_schur(i,j) = host_pack_td_ptr_schur(i-1,j);
1672 Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1674 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1675 const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1676 std::cout <<
"flat_td_ptr = " << std::endl;
1677 for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1678 for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1679 std::cout << host_flat_td_ptr(i,j) <<
" ";
1681 std::cout << std::endl;
1683 std::cout <<
"flat_td_ptr end" << std::endl;
1685 const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1687 std::cout <<
"pack_td_ptr = " << std::endl;
1688 for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1689 for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1690 std::cout << host_pack_td_ptr(i,j) <<
" ";
1692 std::cout << std::endl;
1694 std::cout <<
"pack_td_ptr end" << std::endl;
1697 std::cout <<
"pack_td_ptr_schur = " << std::endl;
1698 for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1699 for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1700 std::cout << host_pack_td_ptr_schur(i,j) <<
" ";
1702 std::cout << std::endl;
1704 std::cout <<
"pack_td_ptr_schur end" << std::endl;
1708 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1723 template<
typename MatrixType>
1725 setTridiagsToIdentity
1726 (
const BlockTridiags<MatrixType>& btdm,
1727 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1730 using execution_space =
typename impl_type::execution_space;
1731 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1732 using size_type_2d_view =
typename impl_type::size_type_2d_view;
1734 const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1735 const local_ordinal_type blocksize = btdm.values.extent(1);
1738 const int vector_length = impl_type::vector_length;
1739 const int internal_vector_length = impl_type::internal_vector_length;
1741 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
1742 using internal_vector_type =
typename impl_type::internal_vector_type;
1743 using internal_vector_type_4d_view =
1744 typename impl_type::internal_vector_type_4d_view;
1746 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1747 const internal_vector_type_4d_view values
1748 (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1749 btdm.values.extent(0),
1750 btdm.values.extent(1),
1751 btdm.values.extent(2),
1752 vector_length/internal_vector_length);
1753 const local_ordinal_type vector_loop_size = values.extent(3);
1754 #if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1755 local_ordinal_type total_team_size(0);
1756 if (blocksize <= 5) total_team_size = 32;
1757 else if (blocksize <= 9) total_team_size = 64;
1758 else if (blocksize <= 12) total_team_size = 96;
1759 else if (blocksize <= 16) total_team_size = 128;
1760 else if (blocksize <= 20) total_team_size = 160;
1761 else total_team_size = 160;
1762 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1763 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1764 #elif defined(KOKKOS_ENABLE_HIP)
1769 local_ordinal_type total_team_size(0);
1770 if (blocksize <= 5) total_team_size = 32;
1771 else if (blocksize <= 9) total_team_size = 64;
1772 else if (blocksize <= 12) total_team_size = 96;
1773 else if (blocksize <= 16) total_team_size = 128;
1774 else if (blocksize <= 20) total_team_size = 160;
1775 else total_team_size = 160;
1776 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1777 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1778 #elif defined(KOKKOS_ENABLE_SYCL)
1780 local_ordinal_type total_team_size(0);
1781 if (blocksize <= 5) total_team_size = 32;
1782 else if (blocksize <= 9) total_team_size = 64;
1783 else if (blocksize <= 12) total_team_size = 96;
1784 else if (blocksize <= 16) total_team_size = 128;
1785 else if (blocksize <= 20) total_team_size = 160;
1786 else total_team_size = 160;
1787 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1788 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1791 const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1793 Kokkos::parallel_for
1794 (
"setTridiagsToIdentity::TeamPolicy",
1795 policy, KOKKOS_LAMBDA(
const typename team_policy_type::member_type &member) {
1796 const local_ordinal_type k = member.league_rank();
1797 const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0);
1798 const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1);
1800 const local_ordinal_type diff = iend - ibeg;
1801 const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1802 const btdm_scalar_type one(1);
1803 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
1804 Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](
const local_ordinal_type &ii) {
1805 const local_ordinal_type i = ibeg + ii*3;
1806 for (local_ordinal_type j=0;j<blocksize;++j) {
1807 values(i,j,j,v) = one;
1818 template<
typename MatrixType>
1821 const BlockHelperDetails::PartInterface<MatrixType> &interf,
1824 const bool overlap_communication_and_computation) {
1825 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::SymbolicPhase");
1830 using host_execution_space =
typename impl_type::host_execution_space;
1832 using local_ordinal_type =
typename impl_type::local_ordinal_type;
1833 using global_ordinal_type =
typename impl_type::global_ordinal_type;
1834 using size_type =
typename impl_type::size_type;
1835 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
1836 using size_type_1d_view =
typename impl_type::size_type_1d_view;
1837 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
1838 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
1839 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
1841 constexpr
int vector_length = impl_type::vector_length;
1843 const auto comm = A->getRowMap()->getComm();
1845 const auto& g = A->getCrsGraph();
1847 const auto blocksize = A->getBlockSize();
1850 const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1851 const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1852 const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1853 const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1854 const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1856 const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1858 Kokkos::View<local_ordinal_type*,host_execution_space> col2row(
"col2row", A->getLocalNumCols());
1864 const auto rowmap = g.getRowMap();
1865 const auto colmap = g.getColMap();
1866 const auto dommap = g.getDomainMap();
1867 TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1869 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1870 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1871 Kokkos::parallel_for
1872 (
"performSymbolicPhase::RangePolicy::col2row",
1873 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
1874 const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1876 if (dommap->isNodeGlobalElement(gid)) {
1877 const local_ordinal_type lc = colmap->getLocalElement(gid);
1878 # if defined(BLOCKTRIDICONTAINER_DEBUG)
1880 BlockHelperDetails::get_msg_prefix(comm) <<
"GID " << gid
1881 <<
" gives an invalid local column.");
1891 const auto local_graph = g.getLocalGraphHost();
1892 const auto local_graph_rowptr = local_graph.row_map;
1893 TEUCHOS_ASSERT(local_graph_rowptr.size() ==
static_cast<size_t>(nrows + 1));
1894 const auto local_graph_colidx = local_graph.entries;
1898 Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx(
"lclrow2idx", nrows);
1900 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1901 Kokkos::parallel_for
1902 (
"performSymbolicPhase::RangePolicy::lclrow2idx",
1903 policy, KOKKOS_LAMBDA(
const local_ordinal_type &i) {
1904 lclrow2idx[lclrow(i)] = i;
1910 typename sum_reducer_type::value_type sum_reducer_value;
1912 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1913 Kokkos::parallel_reduce
1916 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr,
typename sum_reducer_type::value_type &update) {
1918 const local_ordinal_type ri0 = lclrow2idx[lr];
1919 const local_ordinal_type pi0 = rowidx2part(ri0);
1920 for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1921 const local_ordinal_type lc = local_graph_colidx(j);
1922 const local_ordinal_type lc2r = col2row[lc];
1923 bool incr_R =
false;
1925 if (lc2r == (local_ordinal_type) -1) {
1929 const local_ordinal_type ri = lclrow2idx[lc2r];
1930 const local_ordinal_type pi = rowidx2part(ri);
1938 if (ri0 + 1 >= ri && ri0 <= ri + 1)
1944 if (lc < nrows) ++update.v[1];
1948 }, sum_reducer_type(sum_reducer_value));
1950 size_type D_nnz = sum_reducer_value.v[0];
1951 size_type R_nnz_owned = sum_reducer_value.v[1];
1952 size_type R_nnz_remote = sum_reducer_value.v[2];
1954 if (!overlap_communication_and_computation) {
1955 R_nnz_owned += R_nnz_remote;
1961 const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1963 btdm.A_colindsub = local_ordinal_type_1d_view(
"btdm.A_colindsub", D_nnz);
1964 const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
1966 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1970 const local_ordinal_type nparts = partptr.extent(0) - 1;
1973 const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
1974 Kokkos::parallel_for
1975 (
"performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
1976 policy, KOKKOS_LAMBDA(
const local_ordinal_type &pi0) {
1977 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
1978 local_ordinal_type offset = 0;
1979 for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
1980 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
1982 const local_ordinal_type lr0 = lclrow(ri0);
1983 const size_type j0 = local_graph_rowptr(lr0);
1984 for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
1985 const local_ordinal_type lc = local_graph_colidx(j);
1986 const local_ordinal_type lc2r = col2row[lc];
1987 if (lc2r == (local_ordinal_type) -1)
continue;
1988 const local_ordinal_type ri = lclrow2idx[lc2r];
1989 const local_ordinal_type pi = rowidx2part(ri);
1990 if (pi != pi0)
continue;
1991 if (ri + 1 < ri0 || ri > ri0 + 1)
continue;
1992 const local_ordinal_type row_entry = j - j0;
1993 D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry;
1998 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1999 for (
size_t i=0;i<D_A_colindsub.extent(0);++i)
2002 Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2006 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1);
2007 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2008 btdm.values = vector_type_3d_view(
"btdm.values", num_packed_blocks(), blocksize, blocksize);
2010 if (interf.n_subparts_per_part > 1) {
2011 const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1);
2012 const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2013 btdm.values_schur = vector_type_3d_view(
"btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2016 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2022 amd.rowptr = size_type_1d_view(
"amd.rowptr", nrows + 1);
2023 amd.A_colindsub = local_ordinal_type_1d_view(
do_not_initialize_tag(
"amd.A_colindsub"), R_nnz_owned);
2025 const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2026 const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2028 amd.rowptr_remote = size_type_1d_view(
"amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2029 amd.A_colindsub_remote = local_ordinal_type_1d_view(
do_not_initialize_tag(
"amd.A_colindsub_remote"), R_nnz_remote);
2031 const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2032 const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2035 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
2036 Kokkos::parallel_for
2037 (
"performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2038 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr) {
2039 const local_ordinal_type ri0 = lclrow2idx[lr];
2040 const local_ordinal_type pi0 = rowidx2part(ri0);
2041 const size_type j0 = local_graph_rowptr(lr);
2042 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2043 const local_ordinal_type lc = local_graph_colidx(j);
2044 const local_ordinal_type lc2r = col2row[lc];
2045 if (lc2r != (local_ordinal_type) -1) {
2046 const local_ordinal_type ri = lclrow2idx[lc2r];
2047 const local_ordinal_type pi = rowidx2part(ri);
2048 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2053 if (!overlap_communication_and_computation || lc < nrows) {
2056 ++R_rowptr_remote(lr);
2065 Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
2066 Kokkos::parallel_scan
2067 (
"performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2068 policy, KOKKOS_LAMBDA(
const local_ordinal_type &lr,
2069 update_type &update,
2070 const bool &
final) {
2072 val.v[0] = R_rowptr(lr);
2073 if (overlap_communication_and_computation)
2074 val.v[1] = R_rowptr_remote(lr);
2077 R_rowptr(lr) = update.v[0];
2078 if (overlap_communication_and_computation)
2079 R_rowptr_remote(lr) = update.v[1];
2082 const local_ordinal_type ri0 = lclrow2idx[lr];
2083 const local_ordinal_type pi0 = rowidx2part(ri0);
2085 size_type cnt_rowptr = R_rowptr(lr);
2086 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0;
2088 const size_type j0 = local_graph_rowptr(lr);
2089 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2090 const local_ordinal_type lc = local_graph_colidx(j);
2091 const local_ordinal_type lc2r = col2row[lc];
2092 if (lc2r != (local_ordinal_type) -1) {
2093 const local_ordinal_type ri = lclrow2idx[lc2r];
2094 const local_ordinal_type pi = rowidx2part(ri);
2095 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2098 const local_ordinal_type row_entry = j - j0;
2099 if (!overlap_communication_and_computation || lc < nrows)
2100 R_A_colindsub(cnt_rowptr++) = row_entry;
2102 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2110 Kokkos::deep_copy(amd.rowptr, R_rowptr);
2111 Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2112 if (overlap_communication_and_computation) {
2114 Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2115 Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2119 amd.tpetra_values = (
const_cast<block_crs_matrix_type*
>(A.get())->getValuesDeviceNonConst());
2125 if (interf.n_subparts_per_part > 1)
2126 btdm.e_values = vector_type_4d_view(
"btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2128 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2135 template<
typename ArgActiveExecutionMemorySpace>
2140 typedef KB::Mode::Serial mode_type;
2141 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2142 typedef KB::Algo::Level3::CompactMKL algo_type;
2144 typedef KB::Algo::Level3::Blocked algo_type;
2146 static int recommended_team_size(
const int ,
2154 #if defined(KOKKOS_ENABLE_CUDA)
2155 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(
const int blksize,
2156 const int vector_length,
2157 const int internal_vector_length) {
2158 const int vector_size = vector_length/internal_vector_length;
2159 int total_team_size(0);
2160 if (blksize <= 5) total_team_size = 32;
2161 else if (blksize <= 9) total_team_size = 32;
2162 else if (blksize <= 12) total_team_size = 96;
2163 else if (blksize <= 16) total_team_size = 128;
2164 else if (blksize <= 20) total_team_size = 160;
2165 else total_team_size = 160;
2166 return 2*total_team_size/vector_size;
2169 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2170 typedef KB::Mode::Team mode_type;
2171 typedef KB::Algo::Level3::Unblocked algo_type;
2172 static int recommended_team_size(
const int blksize,
2173 const int vector_length,
2174 const int internal_vector_length) {
2175 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2179 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2180 typedef KB::Mode::Team mode_type;
2181 typedef KB::Algo::Level3::Unblocked algo_type;
2182 static int recommended_team_size(
const int blksize,
2183 const int vector_length,
2184 const int internal_vector_length) {
2185 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2190 #if defined(KOKKOS_ENABLE_HIP)
2191 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(
const int blksize,
2192 const int vector_length,
2193 const int internal_vector_length) {
2194 const int vector_size = vector_length/internal_vector_length;
2195 int total_team_size(0);
2196 if (blksize <= 5) total_team_size = 32;
2197 else if (blksize <= 9) total_team_size = 32;
2198 else if (blksize <= 12) total_team_size = 96;
2199 else if (blksize <= 16) total_team_size = 128;
2200 else if (blksize <= 20) total_team_size = 160;
2201 else total_team_size = 160;
2202 return 2*total_team_size/vector_size;
2205 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2206 typedef KB::Mode::Team mode_type;
2207 typedef KB::Algo::Level3::Unblocked algo_type;
2208 static int recommended_team_size(
const int blksize,
2209 const int vector_length,
2210 const int internal_vector_length) {
2211 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2215 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2216 typedef KB::Mode::Team mode_type;
2217 typedef KB::Algo::Level3::Unblocked algo_type;
2218 static int recommended_team_size(
const int blksize,
2219 const int vector_length,
2220 const int internal_vector_length) {
2221 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2226 #if defined(KOKKOS_ENABLE_SYCL)
2227 static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(
const int blksize,
2228 const int vector_length,
2229 const int internal_vector_length) {
2230 const int vector_size = vector_length/internal_vector_length;
2231 int total_team_size(0);
2232 if (blksize <= 5) total_team_size = 32;
2233 else if (blksize <= 9) total_team_size = 32;
2234 else if (blksize <= 12) total_team_size = 96;
2235 else if (blksize <= 16) total_team_size = 128;
2236 else if (blksize <= 20) total_team_size = 160;
2237 else total_team_size = 160;
2238 return 2*total_team_size/vector_size;
2241 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2242 typedef KB::Mode::Team mode_type;
2243 typedef KB::Algo::Level3::Unblocked algo_type;
2244 static int recommended_team_size(
const int blksize,
2245 const int vector_length,
2246 const int internal_vector_length) {
2247 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2251 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2252 typedef KB::Mode::Team mode_type;
2253 typedef KB::Algo::Level3::Unblocked algo_type;
2254 static int recommended_team_size(
const int blksize,
2255 const int vector_length,
2256 const int internal_vector_length) {
2257 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2262 template<
typename impl_type,
typename WWViewType>
2263 KOKKOS_INLINE_FUNCTION
2265 solveMultiVector(
const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2266 const typename impl_type::local_ordinal_type &,
2267 const typename impl_type::local_ordinal_type &i0,
2268 const typename impl_type::local_ordinal_type &r0,
2269 const typename impl_type::local_ordinal_type &nrows,
2270 const typename impl_type::local_ordinal_type &v,
2271 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2272 const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2273 const WWViewType &WW,
2274 const bool skip_first_pass=
false) {
2275 using execution_space =
typename impl_type::execution_space;
2276 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2277 using member_type =
typename team_policy_type::member_type;
2278 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2280 typedef SolveTridiagsDefaultModeAndAlgo
2281 <
typename execution_space::memory_space> default_mode_and_algo_type;
2283 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2284 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2286 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2289 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2290 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2293 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2294 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2297 local_ordinal_type i = i0, r = r0;
2302 if (skip_first_pass) {
2305 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2306 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2307 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2308 KB::Trsm<member_type,
2309 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2310 default_mode_type,default_algo_type>
2311 ::invoke(member, one, A, X2);
2312 X1.assign_data( X2.data() );
2316 KB::Trsm<member_type,
2317 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2318 default_mode_type,default_algo_type>
2319 ::invoke(member, one, A, X1);
2320 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2321 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2322 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2323 member.team_barrier();
2324 KB::Gemm<member_type,
2325 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2326 default_mode_type,default_algo_type>
2327 ::invoke(member, -one, A, X1, one, X2);
2328 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2329 KB::Trsm<member_type,
2330 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2331 default_mode_type,default_algo_type>
2332 ::invoke(member, one, A, X2);
2333 X1.assign_data( X2.data() );
2338 KB::Trsm<member_type,
2339 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2340 default_mode_type,default_algo_type>
2341 ::invoke(member, one, A, X1);
2342 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2344 A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2345 X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2346 member.team_barrier();
2347 KB::Gemm<member_type,
2348 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2349 default_mode_type,default_algo_type>
2350 ::invoke(member, -one, A, X1, one, X2);
2352 A.assign_data( &D_internal_vector_values(i,0,0,v) );
2353 KB::Trsm<member_type,
2354 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2355 default_mode_type,default_algo_type>
2356 ::invoke(member, one, A, X2);
2357 X1.assign_data( X2.data() );
2361 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2362 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2363 ::invoke(member, X1, W);
2364 member.team_barrier();
2365 KB::Gemm<member_type,
2366 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2367 default_mode_type,default_algo_type>
2368 ::invoke(member, one, A, W, zero, X1);
2373 template<
typename impl_type,
typename WWViewType,
typename XViewType>
2374 KOKKOS_INLINE_FUNCTION
2376 solveSingleVectorNew(
const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2377 const typename impl_type::local_ordinal_type &blocksize,
2378 const typename impl_type::local_ordinal_type &i0,
2379 const typename impl_type::local_ordinal_type &r0,
2380 const typename impl_type::local_ordinal_type &nrows,
2381 const typename impl_type::local_ordinal_type &v,
2382 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2383 const XViewType &X_internal_vector_values,
2384 const WWViewType &WW) {
2385 using execution_space =
typename impl_type::execution_space;
2388 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2390 typedef SolveTridiagsDefaultModeAndAlgo
2391 <
typename execution_space::memory_space> default_mode_and_algo_type;
2393 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2394 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2396 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2399 auto A = D_internal_vector_values.data();
2400 auto X = X_internal_vector_values.data();
2403 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2404 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2408 const local_ordinal_type astep = D_internal_vector_values.stride_0();
2409 const local_ordinal_type as0 = D_internal_vector_values.stride_1();
2410 const local_ordinal_type as1 = D_internal_vector_values.stride_2();
2411 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2412 const local_ordinal_type xs0 = X_internal_vector_values.stride_1();
2421 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2422 (default_mode_type,default_algo_type,
2425 blocksize,blocksize,
2430 for (local_ordinal_type tr=1;tr<nrows;++tr) {
2431 member.team_barrier();
2432 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2433 (default_mode_type,default_algo_type,
2435 blocksize, blocksize,
2437 A+2*astep, as0, as1,
2441 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2442 (default_mode_type,default_algo_type,
2445 blocksize,blocksize,
2447 A+3*astep, as0, as1,
2455 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2456 (default_mode_type,default_algo_type,
2459 blocksize, blocksize,
2464 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2466 member.team_barrier();
2467 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2468 (default_mode_type,default_algo_type,
2470 blocksize, blocksize,
2472 A+1*astep, as0, as1,
2476 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2477 (default_mode_type,default_algo_type,
2480 blocksize, blocksize,
2489 const local_ordinal_type ws0 = WW.stride_0();
2490 auto W = WW.data() + v;
2491 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2493 member, blocksize, X, xs0, W, ws0);
2494 member.team_barrier();
2495 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2496 (default_mode_type,default_algo_type,
2498 blocksize, blocksize,
2507 template<
typename local_ordinal_type,
typename ViewType>
2508 void writeBTDValuesToFile (
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2509 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2510 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2511 std::ofstream myfile;
2512 myfile.open (fileName);
2514 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2515 local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2516 const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2517 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2519 const local_ordinal_type block_size = scalar_values.extent(1);
2521 const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size;
2522 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2524 const local_ordinal_type n_packs = ceil(
float(n_parts)/n_parts_per_pack);
2526 myfile <<
"%%MatrixMarket matrix coordinate real general"<< std::endl;
2527 myfile <<
"%%nnz = " << nnz;
2528 myfile <<
" block size = " << block_size;
2529 myfile <<
" number of blocks = " << n_blocks;
2530 myfile <<
" number of parts = " << n_parts;
2531 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2532 myfile <<
" number of rows = " << n_rows ;
2533 myfile <<
" number of cols = " << n_rows;
2534 myfile <<
" number of packs = " << n_packs << std::endl;
2536 myfile << n_rows <<
" " << n_rows <<
" " << nnz << std::setprecision(9) << std::endl;
2538 local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2539 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2540 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2541 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2542 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2543 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2544 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2546 if (i_block_in_part % 3 == 0) {
2547 current_row_offset = i_block_in_part/3 * block_size;
2548 current_col_offset = i_block_in_part/3 * block_size;
2550 else if (i_block_in_part % 3 == 1) {
2551 current_row_offset = (i_block_in_part-1)/3 * block_size;
2552 current_col_offset = ((i_block_in_part-1)/3+1) * block_size;
2554 else if (i_block_in_part % 3 == 2) {
2555 current_row_offset = ((i_block_in_part-2)/3+1) * block_size;
2556 current_col_offset = (i_block_in_part-2)/3 * block_size;
2558 current_row_offset += current_part_idx * n_rows_per_part;
2559 current_col_offset += current_part_idx * n_rows_per_part;
2560 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2561 for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2562 current_row = current_row_offset + i_in_block + 1;
2563 current_col = current_col_offset + j_in_block + 1;
2564 myfile << current_row <<
" " << current_col <<
" " << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2575 template<
typename local_ordinal_type,
typename ViewType>
2576 void write4DMultiVectorValuesToFile (
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2577 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2578 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2579 std::ofstream myfile;
2580 myfile.open (fileName);
2582 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2583 const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2584 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2586 const local_ordinal_type block_size = scalar_values.extent(1);
2587 const local_ordinal_type n_cols = scalar_values.extent(2);
2589 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2590 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2592 const local_ordinal_type n_packs = ceil(
float(n_parts)/n_parts_per_pack);
2595 myfile <<
"%%MatrixMarket matrix array real general"<< std::endl;
2596 myfile <<
"%%block size = " << block_size;
2597 myfile <<
" number of blocks = " << n_blocks;
2598 myfile <<
" number of parts = " << n_parts;
2599 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2600 myfile <<
" number of rows = " << n_rows ;
2601 myfile <<
" number of cols = " << n_cols;
2602 myfile <<
" number of packs = " << n_packs << std::endl;
2604 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2606 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2607 (void) current_row_offset;
2608 (void) current_part_idx;
2609 for (local_ordinal_type j_in_block=0;j_in_block<n_cols;++j_in_block) {
2610 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2611 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2612 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2613 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2614 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2616 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2618 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2619 myfile << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2629 template<
typename local_ordinal_type,
typename ViewType>
2630 void write5DMultiVectorValuesToFile (
const local_ordinal_type &n_parts,
const ViewType &scalar_values_device, std::string fileName) {
2631 #ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2632 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2633 std::ofstream myfile;
2634 myfile.open (fileName);
2636 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2637 const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack;
2638 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2640 const local_ordinal_type block_size = scalar_values.extent(2);
2641 const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2642 const local_ordinal_type n_cols = n_blocks_cols * block_size;
2644 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2645 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2647 const local_ordinal_type n_packs = ceil(
float(n_parts)/n_parts_per_pack);
2649 myfile <<
"%%MatrixMarket matrix array real general"<< std::endl;
2650 myfile <<
"%%block size = " << block_size;
2651 myfile <<
" number of blocks = " << n_blocks;
2652 myfile <<
" number of parts = " << n_parts;
2653 myfile <<
" number of blocks per part = " << n_blocks_per_part;
2654 myfile <<
" number of rows = " << n_rows ;
2655 myfile <<
" number of cols = " << n_cols;
2656 myfile <<
" number of packs = " << n_packs << std::endl;
2658 myfile << n_rows <<
" " << n_cols << std::setprecision(9) << std::endl;
2660 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2661 (void) current_row_offset;
2662 (void) current_part_idx;
2663 for (local_ordinal_type i_block_col=0;i_block_col<n_blocks_cols;++i_block_col) {
2664 for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2665 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2666 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2667 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2668 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2669 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2671 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(1))
2673 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2674 myfile << scalar_values(i_block_col,current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2685 template<
typename local_ordinal_type,
typename member_type,
typename ViewType1,
typename ViewType2>
2686 KOKKOS_INLINE_FUNCTION
2688 copy3DView(
const member_type &member,
const ViewType1 &view1,
const ViewType2 &view2) {
2701 Kokkos::Experimental::local_deep_copy(member, view1, view2);
2703 template<
typename MatrixType>
2704 struct ExtractAndFactorizeTridiags {
2706 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2708 using execution_space =
typename impl_type::execution_space;
2709 using memory_space =
typename impl_type::memory_space;
2711 using local_ordinal_type =
typename impl_type::local_ordinal_type;
2714 using magnitude_type =
typename impl_type::magnitude_type;
2716 using block_crs_matrix_type =
typename impl_type::tpetra_block_crs_matrix_type;
2718 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
2719 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
2720 using size_type_2d_view =
typename impl_type::size_type_2d_view;
2721 using impl_scalar_type_1d_view_tpetra =
typename impl_type::impl_scalar_type_1d_view_tpetra;
2723 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
2724 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
2725 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
2726 using vector_type_4d_view =
typename impl_type::vector_type_4d_view;
2727 using internal_vector_type_4d_view =
typename impl_type::internal_vector_type_4d_view;
2728 using internal_vector_type_5d_view =
typename impl_type::internal_vector_type_5d_view;
2729 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
2730 using btdm_scalar_type_5d_view =
typename impl_type::btdm_scalar_type_5d_view;
2731 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2732 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2734 using internal_vector_type =
typename impl_type::internal_vector_type;
2735 static constexpr
int vector_length = impl_type::vector_length;
2736 static constexpr
int internal_vector_length = impl_type::internal_vector_length;
2739 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2740 using member_type =
typename team_policy_type::member_type;
2744 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2745 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2746 const local_ordinal_type max_partsz;
2748 using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
2749 const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
2750 const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2752 const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2753 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2754 const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2755 const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2756 const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2757 const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2759 const local_ordinal_type blocksize, blocksize_square;
2761 const magnitude_type tiny;
2762 const local_ordinal_type vector_loop_size;
2763 const local_ordinal_type vector_length_value;
2766 ExtractAndFactorizeTridiags(
const BlockTridiags<MatrixType> &btdm_,
2767 const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2769 const magnitude_type& tiny_) :
2771 partptr(interf_.partptr),
2772 lclrow(interf_.lclrow),
2773 packptr(interf_.packptr),
2774 packindices_sub(interf_.packindices_sub),
2775 packptr_sub(interf_.packptr_sub),
2776 partptr_sub(interf_.partptr_sub),
2777 part2packrowidx0_sub(interf_.part2packrowidx0_sub),
2778 packindices_schur(interf_.packindices_schur),
2779 max_partsz(interf_.max_partsz),
2781 A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map),
2782 A_values(const_cast<block_crs_matrix_type*>(A_.get())->getValuesDeviceNonConst()),
2784 pack_td_ptr(btdm_.pack_td_ptr),
2785 flat_td_ptr(btdm_.flat_td_ptr),
2786 pack_td_ptr_schur(btdm_.pack_td_ptr_schur),
2787 A_colindsub(btdm_.A_colindsub),
2788 internal_vector_values((internal_vector_type*)btdm_.values.data(),
2789 btdm_.values.extent(0),
2790 btdm_.values.extent(1),
2791 btdm_.values.extent(2),
2792 vector_length/internal_vector_length),
2793 internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(),
2794 btdm_.values_schur.extent(0),
2795 btdm_.values_schur.extent(1),
2796 btdm_.values_schur.extent(2),
2797 vector_length/internal_vector_length),
2798 e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(),
2799 btdm_.e_values.extent(0),
2800 btdm_.e_values.extent(1),
2801 btdm_.e_values.extent(2),
2802 btdm_.e_values.extent(3),
2803 vector_length/internal_vector_length),
2804 scalar_values((btdm_scalar_type*)btdm_.values.data(),
2805 btdm_.values.extent(0),
2806 btdm_.values.extent(1),
2807 btdm_.values.extent(2),
2809 scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(),
2810 btdm_.values_schur.extent(0),
2811 btdm_.values_schur.extent(1),
2812 btdm_.values_schur.extent(2),
2814 e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(),
2815 btdm_.e_values.extent(0),
2816 btdm_.e_values.extent(1),
2817 btdm_.e_values.extent(2),
2818 btdm_.e_values.extent(3),
2820 blocksize(btdm_.values.extent(1)),
2821 blocksize_square(blocksize*blocksize),
2824 vector_loop_size(vector_length/internal_vector_length),
2825 vector_length_value(vector_length) {}
2829 KOKKOS_INLINE_FUNCTION
2831 extract(local_ordinal_type partidx,
2832 local_ordinal_type local_subpartidx,
2833 local_ordinal_type npacks)
const {
2834 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2835 printf(
"extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
2837 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2838 const size_type kps = pack_td_ptr(partidx, local_subpartidx);
2839 local_ordinal_type kfs[vector_length] = {};
2840 local_ordinal_type ri0[vector_length] = {};
2841 local_ordinal_type nrows[vector_length] = {};
2843 for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2844 kfs[vi] = flat_td_ptr(partidx,local_subpartidx);
2845 ri0[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,0);
2846 nrows[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,1) - ri0[vi];
2847 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2848 printf(
"kfs[%d] = %d;\n", vi, kfs[vi]);
2849 printf(
"ri0[%d] = %d;\n", vi, ri0[vi]);
2850 printf(
"nrows[%d] = %d;\n", vi, nrows[vi]);
2853 local_ordinal_type tr_min = 0;
2854 local_ordinal_type tr_max = nrows[0];
2855 if (local_subpartidx % 2 == 1) {
2859 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2860 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2862 for (local_ordinal_type tr=tr_min,j=0;tr<tr_max;++tr) {
2863 for (local_ordinal_type e=0;e<3;++e) {
2864 const impl_scalar_type* block[vector_length] = {};
2865 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2866 const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2867 block[vi] = &A_values(Aj*blocksize_square);
2869 const size_type pi = kps + j;
2870 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2871 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
2874 for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2875 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2876 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2877 auto& v = internal_vector_values(pi, ii, jj, 0);
2878 for (local_ordinal_type vi=0;vi<npacks;++vi)
2879 v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2883 if (nrows[0] == 1)
break;
2884 if (local_subpartidx % 2 == 0) {
2885 if (e == 1 && (tr == 0 || tr+1 == nrows[0]))
break;
2886 for (local_ordinal_type vi=1;vi<npacks;++vi) {
2887 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
2894 if (e == 0 && (tr == -1 || tr == nrows[0]))
break;
2895 for (local_ordinal_type vi=1;vi<npacks;++vi) {
2896 if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
2906 KOKKOS_INLINE_FUNCTION
2908 extract(
const member_type &member,
2909 const local_ordinal_type &partidxbeg,
2910 local_ordinal_type local_subpartidx,
2911 const local_ordinal_type &npacks,
2912 const local_ordinal_type &vbeg)
const {
2913 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2914 printf(
"extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
2916 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2917 local_ordinal_type kfs_vals[internal_vector_length] = {};
2918 local_ordinal_type ri0_vals[internal_vector_length] = {};
2919 local_ordinal_type nrows_vals[internal_vector_length] = {};
2921 const size_type kps = pack_td_ptr(partidxbeg,local_subpartidx);
2922 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2923 kfs_vals[vi] = flat_td_ptr(partidxbeg+vi,local_subpartidx);
2924 ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,0);
2925 nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,1) - ri0_vals[vi];
2926 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2927 printf(
"kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
2928 printf(
"ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
2929 printf(
"nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
2933 local_ordinal_type j_vals[internal_vector_length] = {};
2935 local_ordinal_type tr_min = 0;
2936 local_ordinal_type tr_max = nrows_vals[0];
2937 if (local_subpartidx % 2 == 1) {
2941 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2942 printf(
"tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2944 for (local_ordinal_type tr=tr_min;tr<tr_max;++tr) {
2945 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
2946 const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
2947 if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows+1)) {
2948 auto &j = j_vals[vi];
2949 const local_ordinal_type kfs = kfs_vals[vi];
2950 const local_ordinal_type ri0 = ri0_vals[vi];
2951 local_ordinal_type lbeg, lend;
2952 if (local_subpartidx % 2 == 0) {
2953 lbeg = (tr == tr_min ? 1 : 0);
2954 lend = (tr == nrows - 1 ? 2 : 3);
2963 else if (tr == nrows) {
2968 for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
2969 const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
2970 const impl_scalar_type* block = &A_values(Aj*blocksize_square);
2971 const size_type pi = kps + j;
2972 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2973 printf(
"Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
2975 Kokkos::parallel_for
2976 (Kokkos::TeamThreadRange(member,blocksize),
2977 [&](
const local_ordinal_type &ii) {
2978 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2979 scalar_values(pi, ii, jj, v) =
static_cast<btdm_scalar_type
>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
2988 template<
typename AAViewType,
2989 typename WWViewType>
2990 KOKKOS_INLINE_FUNCTION
2992 factorize_subline(
const member_type &member,
2993 const local_ordinal_type &i0,
2994 const local_ordinal_type &nrows,
2995 const local_ordinal_type &v,
2996 const AAViewType &AA,
2997 const WWViewType &WW)
const {
2999 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3000 <
typename execution_space::memory_space> default_mode_and_algo_type;
3002 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3003 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3006 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3008 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3009 printf(
"i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3013 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3015 default_mode_type,KB::Algo::LU::Unblocked>
3016 ::invoke(member, A , tiny);
3021 local_ordinal_type i = i0;
3022 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
3023 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3024 printf(
"tr = %d, i = %d;\n", tr, i);
3026 B.assign_data( &AA(i+1,0,0,v) );
3027 KB::Trsm<member_type,
3028 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3029 default_mode_type,default_algo_type>
3030 ::invoke(member, one, A, B);
3031 C.assign_data( &AA(i+2,0,0,v) );
3032 KB::Trsm<member_type,
3033 KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3034 default_mode_type,default_algo_type>
3035 ::invoke(member, one, A, C);
3036 A.assign_data( &AA(i+3,0,0,v) );
3038 member.team_barrier();
3039 KB::Gemm<member_type,
3040 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3041 default_mode_type,default_algo_type>
3042 ::invoke(member, -one, C, B, one, A);
3044 default_mode_type,KB::Algo::LU::Unblocked>
3045 ::invoke(member, A, tiny);
3049 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3050 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
3051 ::invoke(member, A, W);
3052 KB::SetIdentity<member_type,default_mode_type>
3053 ::invoke(member, A);
3054 member.team_barrier();
3055 KB::Trsm<member_type,
3056 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3057 default_mode_type,default_algo_type>
3058 ::invoke(member, one, W, A);
3059 KB::Trsm<member_type,
3060 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3061 default_mode_type,default_algo_type>
3062 ::invoke(member, one, W, A);
3068 struct ExtractAndFactorizeSubLineTag {};
3069 struct ExtractBCDTag {};
3070 struct ComputeETag {};
3071 struct ComputeSchurTag {};
3072 struct FactorizeSchurTag {};
3074 KOKKOS_INLINE_FUNCTION
3076 operator() (
const ExtractAndFactorizeSubLineTag &,
const member_type &member)
const {
3078 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3080 const local_ordinal_type subpartidx = packptr_sub(packidx);
3081 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3082 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3083 const local_ordinal_type partidx = subpartidx%n_parts;
3085 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3086 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3087 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3089 internal_vector_scratch_type_3d_view
3090 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
3092 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3093 printf(
"rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3094 printf(
"vector_loop_size = %d\n", vector_loop_size);
3097 if (vector_loop_size == 1) {
3098 extract(partidx, local_subpartidx, npacks);
3099 factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3101 Kokkos::parallel_for
3102 (Kokkos::ThreadVectorRange(member, vector_loop_size),
3103 [&](
const local_ordinal_type &v) {
3104 const local_ordinal_type vbeg = v*internal_vector_length;
3105 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3106 printf(
"i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3109 extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3112 member.team_barrier();
3113 factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3118 KOKKOS_INLINE_FUNCTION
3120 operator() (
const ExtractBCDTag &,
const member_type &member)
const {
3122 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3123 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3124 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3126 const local_ordinal_type subpartidx = packptr_sub(packidx);
3127 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3128 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3129 const local_ordinal_type partidx = subpartidx%n_parts;
3131 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3135 if (vector_loop_size == 1) {
3136 extract(partidx, local_subpartidx, npacks);
3139 Kokkos::parallel_for
3140 (Kokkos::ThreadVectorRange(member, vector_loop_size),
3141 [&](
const local_ordinal_type &v) {
3142 const local_ordinal_type vbeg = v*internal_vector_length;
3143 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3144 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3145 printf(
"i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3148 extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3152 member.team_barrier();
3154 const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3155 const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1;
3157 const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3158 const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3160 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3161 printf(
"Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3165 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3166 Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3168 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3169 Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3173 KOKKOS_INLINE_FUNCTION
3175 operator() (
const ComputeETag &,
const member_type &member)
const {
3177 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3179 const local_ordinal_type subpartidx = packptr_sub(packidx);
3180 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3181 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3182 const local_ordinal_type partidx = subpartidx%n_parts;
3184 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3185 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3186 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3187 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3188 const local_ordinal_type num_vectors = blocksize;
3192 internal_vector_scratch_type_3d_view
3193 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
3194 if (local_subpartidx == 0) {
3195 Kokkos::parallel_for
3196 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
3197 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW,
true);
3200 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
3201 Kokkos::parallel_for
3202 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
3203 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3207 Kokkos::parallel_for
3208 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
3209 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW,
true);
3210 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3215 KOKKOS_INLINE_FUNCTION
3217 operator() (
const ComputeSchurTag &,
const member_type &member)
const {
3219 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3220 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3221 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3223 const local_ordinal_type subpartidx = packptr_sub(packidx);
3224 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3225 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3226 const local_ordinal_type partidx = subpartidx%n_parts;
3229 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3233 internal_vector_scratch_type_3d_view
3234 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
3238 const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
3239 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
3240 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
3242 for (local_ordinal_type i = 0; i < 4; ++i) {
3243 copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3244 Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3247 member.team_barrier();
3249 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3251 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1;
3252 const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2;
3254 const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3255 const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3257 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3258 <
typename execution_space::memory_space> default_mode_and_algo_type;
3260 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3261 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3263 Kokkos::parallel_for
3264 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
3265 for (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) {
3266 local_ordinal_type e_r, e_c, c_kps;
3268 if ( local_subpartidx_schur == 0 ) {
3274 else if ( i == 3 ) {
3279 else if ( i == 4 ) {
3294 else if ( i == 1 ) {
3299 else if ( i == 4 ) {
3304 else if ( i == 5 ) {
3314 auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v);
3315 auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3316 auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3317 KB::Gemm<member_type,
3318 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3319 default_mode_type,default_algo_type>
3320 ::invoke(member, -one, C, E, one, S);
3325 KOKKOS_INLINE_FUNCTION
3327 operator() (
const FactorizeSchurTag &,
const member_type &member)
const {
3328 const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3330 const local_ordinal_type subpartidx = packptr_sub(packidx);
3332 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3333 const local_ordinal_type partidx = subpartidx%n_parts;
3335 const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0);
3336 const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);
3338 internal_vector_scratch_type_3d_view
3339 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
3341 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3342 printf(
"FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3345 if (vector_loop_size == 1) {
3346 factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3348 Kokkos::parallel_for
3349 (Kokkos::ThreadVectorRange(member, vector_loop_size),
3350 [&](
const local_ordinal_type &v) {
3351 factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3357 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3358 const local_ordinal_type team_size =
3359 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3360 recommended_team_size(blocksize, vector_length, internal_vector_length);
3361 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3362 shmem_size(blocksize, blocksize, vector_loop_size);
3365 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3366 printf(
"Start ExtractAndFactorizeSubLineTag\n");
3368 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag");
3369 Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeSubLineTag>
3370 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3373 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3374 writeBTDValuesToFile(n_parts, scalar_values,
"before.mm");
3376 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3377 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3379 execution_space().fence();
3381 writeBTDValuesToFile(n_parts, scalar_values,
"after.mm");
3382 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3383 printf(
"End ExtractAndFactorizeSubLineTag\n");
3387 if (packindices_schur.extent(1) > 0)
3390 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3391 printf(
"Start ExtractBCDTag\n");
3393 Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3394 Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3396 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_before_extract.mm");
3399 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ExtractBCDTag");
3400 Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
3401 policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3403 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3404 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3406 execution_space().fence();
3409 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3410 printf(
"End ExtractBCDTag\n");
3412 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values,
"after_extraction_of_BCD.mm");
3413 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3414 printf(
"Start ComputeETag\n");
3416 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_after_extract.mm");
3418 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ComputeETag");
3419 Kokkos::TeamPolicy<execution_space,ComputeETag>
3420 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3422 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3423 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3425 execution_space().fence();
3427 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values,
"e_scalar_values_after_compute.mm");
3429 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3430 printf(
"End ComputeETag\n");
3435 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3436 printf(
"Start ComputeSchurTag\n");
3438 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::ComputeSchurTag");
3439 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"before_schur.mm");
3440 Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
3441 policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3443 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3444 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3446 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"after_schur.mm");
3447 execution_space().fence();
3448 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3449 printf(
"End ComputeSchurTag\n");
3454 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3455 printf(
"Start FactorizeSchurTag\n");
3457 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase::FactorizeSchurTag");
3458 Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
3459 policy(packindices_schur.extent(0), team_size, vector_loop_size);
3460 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
3461 Kokkos::parallel_for(
"ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3463 execution_space().fence();
3464 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur,
"after_factor_schur.mm");
3465 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3466 printf(
"End FactorizeSchurTag\n");
3471 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3479 template<
typename MatrixType>
3482 const BlockHelperDetails::PartInterface<MatrixType> &interf,
3484 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny) {
3485 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::NumericPhase");
3486 ExtractAndFactorizeTridiags<MatrixType>
function(btdm, interf, A, tiny);
3488 IFPACK2_BLOCKHELPER_TIMER_FENCE(
typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3494 template<
typename MatrixType>
3498 using execution_space =
typename impl_type::execution_space;
3499 using memory_space =
typename impl_type::memory_space;
3501 using local_ordinal_type =
typename impl_type::local_ordinal_type;
3503 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
3504 using tpetra_multivector_type =
typename impl_type::tpetra_multivector_type;
3505 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
3506 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
3507 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
3508 using const_impl_scalar_type_2d_view_tpetra =
typename impl_scalar_type_2d_view_tpetra::const_type;
3509 static constexpr
int vector_length = impl_type::vector_length;
3511 using member_type =
typename Kokkos::TeamPolicy<execution_space>::member_type;
3515 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3516 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3517 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3518 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3519 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3520 const local_ordinal_type blocksize;
3521 const local_ordinal_type num_vectors;
3524 vector_type_3d_view packed_multivector;
3525 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3527 template<
typename TagType>
3528 KOKKOS_INLINE_FUNCTION
3529 void copy_multivectors(
const local_ordinal_type &j,
3530 const local_ordinal_type &vi,
3531 const local_ordinal_type &pri,
3532 const local_ordinal_type &ri0)
const {
3533 for (local_ordinal_type col=0;col<num_vectors;++col)
3534 for (local_ordinal_type i=0;i<blocksize;++i)
3535 packed_multivector(pri, i, col)[vi] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3541 const vector_type_3d_view &pmv)
3542 : partptr(interf.partptr),
3543 packptr(interf.packptr),
3544 part2packrowidx0(interf.part2packrowidx0),
3545 part2rowidx0(interf.part2rowidx0),
3546 lclrow(interf.lclrow),
3547 blocksize(pmv.extent(1)),
3548 num_vectors(pmv.extent(2)),
3549 packed_multivector(pmv) {}
3553 KOKKOS_INLINE_FUNCTION
3555 operator() (
const local_ordinal_type &packidx)
const {
3556 local_ordinal_type partidx = packptr(packidx);
3557 local_ordinal_type npacks = packptr(packidx+1) - partidx;
3558 const local_ordinal_type pri0 = part2packrowidx0(partidx);
3560 local_ordinal_type ri0[vector_length] = {};
3561 local_ordinal_type nrows[vector_length] = {};
3562 for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
3563 ri0[v] = part2rowidx0(partidx);
3564 nrows[v] = part2rowidx0(partidx+1) - ri0[v];
3566 for (local_ordinal_type j=0;j<nrows[0];++j) {
3567 local_ordinal_type cnt = 1;
3568 for (;cnt<npacks && j!= nrows[cnt];++cnt);
3570 const local_ordinal_type pri = pri0 + j;
3571 for (local_ordinal_type col=0;col<num_vectors;++col)
3572 for (local_ordinal_type i=0;i<blocksize;++i)
3573 for (local_ordinal_type v=0;v<npacks;++v)
3574 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
3578 KOKKOS_INLINE_FUNCTION
3580 operator() (
const member_type &member)
const {
3581 const local_ordinal_type packidx = member.league_rank();
3582 const local_ordinal_type partidx_begin = packptr(packidx);
3583 const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
3584 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3585 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](
const local_ordinal_type &v) {
3586 const local_ordinal_type partidx = partidx_begin + v;
3587 const local_ordinal_type ri0 = part2rowidx0(partidx);
3588 const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
3591 const local_ordinal_type pri = pri0;
3592 for (local_ordinal_type col=0;col<num_vectors;++col) {
3593 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](
const local_ordinal_type &i) {
3594 packed_multivector(pri, i, col)[v] =
static_cast<btdm_scalar_type
>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
3598 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](
const local_ordinal_type &j) {
3599 const local_ordinal_type pri = pri0 + j;
3600 for (local_ordinal_type col=0;col<num_vectors;++col)
3601 for (local_ordinal_type i=0;i<blocksize;++i)
3602 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3608 void run(
const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3609 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3610 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::MultiVectorConverter");
3612 scalar_multivector = scalar_multivector_;
3613 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3614 const local_ordinal_type vl = vector_length;
3615 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3616 Kokkos::parallel_for
3617 (
"MultiVectorConverter::TeamPolicy", policy, *
this);
3619 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3620 Kokkos::parallel_for
3621 (
"MultiVectorConverter::RangePolicy", policy, *
this);
3623 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3624 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3633 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3634 typedef KB::Mode::Serial mode_type;
3635 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3636 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3637 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3639 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3641 static int recommended_team_size(
const int ,
3648 #if defined(KOKKOS_ENABLE_CUDA)
3649 static inline int SolveTridiagsRecommendedCudaTeamSize(
const int blksize,
3650 const int vector_length,
3651 const int internal_vector_length) {
3652 const int vector_size = vector_length/internal_vector_length;
3653 int total_team_size(0);
3654 if (blksize <= 5) total_team_size = 32;
3655 else if (blksize <= 9) total_team_size = 32;
3656 else if (blksize <= 12) total_team_size = 96;
3657 else if (blksize <= 16) total_team_size = 128;
3658 else if (blksize <= 20) total_team_size = 160;
3659 else total_team_size = 160;
3660 return total_team_size/vector_size;
3664 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3665 typedef KB::Mode::Team mode_type;
3666 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3667 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3668 static int recommended_team_size(
const int blksize,
3669 const int vector_length,
3670 const int internal_vector_length) {
3671 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3675 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3676 typedef KB::Mode::Team mode_type;
3677 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3678 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3679 static int recommended_team_size(
const int blksize,
3680 const int vector_length,
3681 const int internal_vector_length) {
3682 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3687 #if defined(KOKKOS_ENABLE_HIP)
3688 static inline int SolveTridiagsRecommendedHIPTeamSize(
const int blksize,
3689 const int vector_length,
3690 const int internal_vector_length) {
3691 const int vector_size = vector_length/internal_vector_length;
3692 int total_team_size(0);
3693 if (blksize <= 5) total_team_size = 32;
3694 else if (blksize <= 9) total_team_size = 32;
3695 else if (blksize <= 12) total_team_size = 96;
3696 else if (blksize <= 16) total_team_size = 128;
3697 else if (blksize <= 20) total_team_size = 160;
3698 else total_team_size = 160;
3699 return total_team_size/vector_size;
3703 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3704 typedef KB::Mode::Team mode_type;
3705 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3706 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3707 static int recommended_team_size(
const int blksize,
3708 const int vector_length,
3709 const int internal_vector_length) {
3710 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3714 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3715 typedef KB::Mode::Team mode_type;
3716 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3717 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3718 static int recommended_team_size(
const int blksize,
3719 const int vector_length,
3720 const int internal_vector_length) {
3721 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3726 #if defined(KOKKOS_ENABLE_SYCL)
3727 static inline int SolveTridiagsRecommendedSYCLTeamSize(
const int blksize,
3728 const int vector_length,
3729 const int internal_vector_length) {
3730 const int vector_size = vector_length/internal_vector_length;
3731 int total_team_size(0);
3732 if (blksize <= 5) total_team_size = 32;
3733 else if (blksize <= 9) total_team_size = 32;
3734 else if (blksize <= 12) total_team_size = 96;
3735 else if (blksize <= 16) total_team_size = 128;
3736 else if (blksize <= 20) total_team_size = 160;
3737 else total_team_size = 160;
3738 return total_team_size/vector_size;
3742 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
3743 typedef KB::Mode::Team mode_type;
3744 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3745 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3746 static int recommended_team_size(
const int blksize,
3747 const int vector_length,
3748 const int internal_vector_length) {
3749 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3753 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
3754 typedef KB::Mode::Team mode_type;
3755 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3756 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3757 static int recommended_team_size(
const int blksize,
3758 const int vector_length,
3759 const int internal_vector_length) {
3760 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3768 template<
typename MatrixType>
3769 struct SolveTridiags {
3771 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3772 using execution_space =
typename impl_type::execution_space;
3774 using local_ordinal_type =
typename impl_type::local_ordinal_type;
3777 using magnitude_type =
typename impl_type::magnitude_type;
3778 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
3779 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
3781 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
3782 using local_ordinal_type_2d_view =
typename impl_type::local_ordinal_type_2d_view;
3783 using size_type_2d_view =
typename impl_type::size_type_2d_view;
3785 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
3786 using internal_vector_type_4d_view =
typename impl_type::internal_vector_type_4d_view;
3787 using internal_vector_type_5d_view =
typename impl_type::internal_vector_type_5d_view;
3788 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
3790 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3792 using internal_vector_type =
typename impl_type::internal_vector_type;
3793 static constexpr
int vector_length = impl_type::vector_length;
3794 static constexpr
int internal_vector_length = impl_type::internal_vector_length;
3797 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
3798 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
3801 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3802 using member_type =
typename team_policy_type::member_type;
3806 local_ordinal_type n_subparts_per_part;
3807 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3808 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3809 const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
3810 const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
3811 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3812 const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
3813 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3814 const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
3816 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
3817 const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
3820 const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
3823 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
3824 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
3825 const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
3827 internal_vector_type_4d_view X_internal_vector_values_schur;
3829 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
3830 const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
3833 const local_ordinal_type vector_loop_size;
3836 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
3837 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
3838 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3840 Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3842 const impl_scalar_type df;
3843 const bool compute_diff;
3846 SolveTridiags(
const BlockHelperDetails::PartInterface<MatrixType> &interf,
3847 const BlockTridiags<MatrixType> &btdm,
3848 const vector_type_3d_view &pmv,
3849 const impl_scalar_type damping_factor,
3850 const bool is_norm_manager_active)
3853 n_subparts_per_part(interf.n_subparts_per_part),
3854 partptr(interf.partptr),
3855 packptr(interf.packptr),
3856 packindices_sub(interf.packindices_sub),
3857 packindices_schur(interf.packindices_schur),
3858 part2packrowidx0(interf.part2packrowidx0),
3859 part2packrowidx0_sub(interf.part2packrowidx0_sub),
3860 lclrow(interf.lclrow),
3861 packptr_sub(interf.packptr_sub),
3862 partptr_sub(interf.partptr_sub),
3863 pack_td_ptr_schur(btdm.pack_td_ptr_schur),
3865 pack_td_ptr(btdm.pack_td_ptr),
3866 D_internal_vector_values((internal_vector_type*)btdm.values.data(),
3867 btdm.values.extent(0),
3868 btdm.values.extent(1),
3869 btdm.values.extent(2),
3870 vector_length/internal_vector_length),
3871 X_internal_vector_values((internal_vector_type*)pmv.data(),
3875 vector_length/internal_vector_length),
3876 X_internal_scalar_values((btdm_scalar_type*)pmv.data(),
3882 2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0),
3885 vector_length/internal_vector_length),
3886 D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(),
3887 btdm.values_schur.extent(0),
3888 btdm.values_schur.extent(1),
3889 btdm.values_schur.extent(2),
3890 vector_length/internal_vector_length),
3891 e_internal_vector_values((internal_vector_type*)btdm.e_values.data(),
3892 btdm.e_values.extent(0),
3893 btdm.e_values.extent(1),
3894 btdm.e_values.extent(2),
3895 btdm.e_values.extent(3),
3896 vector_length/internal_vector_length),
3897 vector_loop_size(vector_length/internal_vector_length),
3898 Y_scalar_multivector(),
3901 compute_diff(is_norm_manager_active)
3907 KOKKOS_INLINE_FUNCTION
3909 copyToFlatMultiVector(
const member_type &member,
3910 const local_ordinal_type partidxbeg,
3911 const local_ordinal_type npacks,
3912 const local_ordinal_type pri0,
3913 const local_ordinal_type v,
3914 const local_ordinal_type blocksize,
3915 const local_ordinal_type num_vectors)
const {
3916 const local_ordinal_type vbeg = v*internal_vector_length;
3917 if (vbeg < npacks) {
3918 local_ordinal_type ri0_vals[internal_vector_length] = {};
3919 local_ordinal_type nrows_vals[internal_vector_length] = {};
3920 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
3921 const local_ordinal_type partidx = partidxbeg+vv;
3922 ri0_vals[vi] = partptr(partidx);
3923 nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
3926 impl_scalar_type z_partial_sum(0);
3927 if (nrows_vals[0] == 1) {
3928 const local_ordinal_type j=0, pri=pri0;
3930 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
3931 const local_ordinal_type ri0 = ri0_vals[vi];
3932 const local_ordinal_type nrows = nrows_vals[vi];
3934 Kokkos::parallel_for
3935 (Kokkos::TeamThreadRange(member, blocksize),
3936 [&](
const local_ordinal_type &i) {
3937 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
3938 for (local_ordinal_type col=0;col<num_vectors;++col) {
3939 impl_scalar_type &y = Y_scalar_multivector(row,col);
3940 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
3944 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
3945 z_partial_sum += yd_abs*yd_abs;
3953 Kokkos::parallel_for
3954 (Kokkos::TeamThreadRange(member, nrows_vals[0]),
3955 [&](
const local_ordinal_type &j) {
3956 const local_ordinal_type pri = pri0 + j;
3957 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
3958 const local_ordinal_type ri0 = ri0_vals[vi];
3959 const local_ordinal_type nrows = nrows_vals[vi];
3961 for (local_ordinal_type col=0;col<num_vectors;++col) {
3962 for (local_ordinal_type i=0;i<blocksize;++i) {
3963 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
3964 impl_scalar_type &y = Y_scalar_multivector(row,col);
3965 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
3969 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
3970 z_partial_sum += yd_abs*yd_abs;
3979 Z_scalar_vector(member.league_rank()) += z_partial_sum;
3986 template<
typename WWViewType>
3987 KOKKOS_INLINE_FUNCTION
3989 solveSingleVector(
const member_type &member,
3990 const local_ordinal_type &blocksize,
3991 const local_ordinal_type &i0,
3992 const local_ordinal_type &r0,
3993 const local_ordinal_type &nrows,
3994 const local_ordinal_type &v,
3995 const WWViewType &WW)
const {
3997 typedef SolveTridiagsDefaultModeAndAlgo
3998 <
typename execution_space::memory_space> default_mode_and_algo_type;
4000 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4001 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4004 auto A = D_internal_vector_values.data();
4005 auto X = X_internal_vector_values.data();
4008 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4009 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4013 const local_ordinal_type astep = D_internal_vector_values.stride_0();
4014 const local_ordinal_type as0 = D_internal_vector_values.stride_1();
4015 const local_ordinal_type as1 = D_internal_vector_values.stride_2();
4016 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
4017 const local_ordinal_type xs0 = X_internal_vector_values.stride_1();
4026 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4027 (default_mode_type,default_algo_type,
4030 blocksize,blocksize,
4035 for (local_ordinal_type tr=1;tr<nrows;++tr) {
4036 member.team_barrier();
4037 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4038 (default_mode_type,default_algo_type,
4040 blocksize, blocksize,
4042 A+2*astep, as0, as1,
4046 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4047 (default_mode_type,default_algo_type,
4050 blocksize,blocksize,
4052 A+3*astep, as0, as1,
4060 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4061 (default_mode_type,default_algo_type,
4064 blocksize, blocksize,
4069 for (local_ordinal_type tr=nrows;tr>1;--tr) {
4071 member.team_barrier();
4072 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4073 (default_mode_type,default_algo_type,
4075 blocksize, blocksize,
4077 A+1*astep, as0, as1,
4081 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4082 (default_mode_type,default_algo_type,
4085 blocksize, blocksize,
4094 const local_ordinal_type ws0 = WW.stride_0();
4095 auto W = WW.data() + v;
4096 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
4098 member, blocksize, X, xs0, W, ws0);
4099 member.team_barrier();
4100 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4101 (default_mode_type,default_algo_type,
4103 blocksize, blocksize,
4112 template<
typename WWViewType>
4113 KOKKOS_INLINE_FUNCTION
4115 solveMultiVector(
const member_type &member,
4116 const local_ordinal_type &,
4117 const local_ordinal_type &i0,
4118 const local_ordinal_type &r0,
4119 const local_ordinal_type &nrows,
4120 const local_ordinal_type &v,
4121 const WWViewType &WW)
const {
4123 typedef SolveTridiagsDefaultModeAndAlgo
4124 <
typename execution_space::memory_space> default_mode_and_algo_type;
4126 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4127 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4130 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4131 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4134 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4135 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4138 local_ordinal_type i = i0, r = r0;
4143 KB::Trsm<member_type,
4144 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4145 default_mode_type,default_algo_type>
4146 ::invoke(member, one, A, X1);
4147 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
4148 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
4149 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
4150 member.team_barrier();
4151 KB::Gemm<member_type,
4152 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4153 default_mode_type,default_algo_type>
4154 ::invoke(member, -one, A, X1, one, X2);
4155 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
4156 KB::Trsm<member_type,
4157 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4158 default_mode_type,default_algo_type>
4159 ::invoke(member, one, A, X2);
4160 X1.assign_data( X2.data() );
4164 KB::Trsm<member_type,
4165 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4166 default_mode_type,default_algo_type>
4167 ::invoke(member, one, A, X1);
4168 for (local_ordinal_type tr=nrows;tr>1;--tr) {
4170 A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
4171 X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
4172 member.team_barrier();
4173 KB::Gemm<member_type,
4174 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4175 default_mode_type,default_algo_type>
4176 ::invoke(member, -one, A, X1, one, X2);
4178 A.assign_data( &D_internal_vector_values(i,0,0,v) );
4179 KB::Trsm<member_type,
4180 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4181 default_mode_type,default_algo_type>
4182 ::invoke(member, one, A, X2);
4183 X1.assign_data( X2.data() );
4187 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4188 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
4189 ::invoke(member, X1, W);
4190 member.team_barrier();
4191 KB::Gemm<member_type,
4192 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4193 default_mode_type,default_algo_type>
4194 ::invoke(member, one, A, W, zero, X1);
4198 template<
int B>
struct SingleVectorTag {};
4199 template<
int B>
struct MultiVectorTag {};
4201 template<
int B>
struct SingleVectorSubLineTag {};
4202 template<
int B>
struct MultiVectorSubLineTag {};
4203 template<
int B>
struct SingleVectorApplyCTag {};
4204 template<
int B>
struct MultiVectorApplyCTag {};
4205 template<
int B>
struct SingleVectorSchurTag {};
4206 template<
int B>
struct MultiVectorSchurTag {};
4207 template<
int B>
struct SingleVectorApplyETag {};
4208 template<
int B>
struct MultiVectorApplyETag {};
4209 template<
int B>
struct SingleVectorCopyToFlatTag {};
4210 template<
int B>
struct SingleZeroingTag {};
4213 KOKKOS_INLINE_FUNCTION
4215 operator() (
const SingleVectorTag<B> &,
const member_type &member)
const {
4216 const local_ordinal_type packidx = member.league_rank();
4217 const local_ordinal_type partidx = packptr(packidx);
4218 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4219 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4220 const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4221 const local_ordinal_type r0 = part2packrowidx0(partidx);
4222 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4223 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4224 const local_ordinal_type num_vectors = 1;
4225 internal_vector_scratch_type_3d_view
4226 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4227 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4228 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4230 Kokkos::parallel_for
4231 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4232 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4233 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4238 KOKKOS_INLINE_FUNCTION
4240 operator() (
const MultiVectorTag<B> &,
const member_type &member)
const {
4241 const local_ordinal_type packidx = member.league_rank();
4242 const local_ordinal_type partidx = packptr(packidx);
4243 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4244 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4245 const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4246 const local_ordinal_type r0 = part2packrowidx0(partidx);
4247 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4248 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4249 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4251 internal_vector_scratch_type_3d_view
4252 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4253 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4254 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4256 Kokkos::parallel_for
4257 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4258 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4259 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4264 KOKKOS_INLINE_FUNCTION
4266 operator() (
const SingleVectorSubLineTag<B> &,
const member_type &member)
const {
4268 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4270 const local_ordinal_type subpartidx = packptr_sub(packidx);
4271 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4272 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4273 const local_ordinal_type partidx = subpartidx%n_parts;
4275 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4276 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4277 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4278 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4279 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4280 const local_ordinal_type num_vectors = blocksize;
4286 internal_vector_scratch_type_3d_view
4287 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4289 Kokkos::parallel_for
4290 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4291 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4296 KOKKOS_INLINE_FUNCTION
4298 operator() (
const SingleVectorApplyCTag<B> &,
const member_type &member)
const {
4301 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4303 const local_ordinal_type subpartidx = packptr_sub(packidx);
4304 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4305 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4306 const local_ordinal_type partidx = subpartidx%n_parts;
4307 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4310 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4311 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4312 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4314 internal_vector_scratch_type_3d_view
4315 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4319 const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
4320 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
4321 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
4326 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4328 const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0;
4329 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1;
4331 typedef SolveTridiagsDefaultModeAndAlgo
4332 <
typename execution_space::memory_space> default_mode_and_algo_type;
4334 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4335 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4337 if (local_subpartidx == 0) {
4338 Kokkos::parallel_for
4339 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4340 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4341 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4342 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4344 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4345 (default_mode_type,default_algo_type,
4347 blocksize, blocksize,
4349 C.data(), C.stride_0(), C.stride_1(),
4350 v_1.data(), v_1.stride_0(),
4352 v_2.data(), v_2.stride_0());
4355 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4356 Kokkos::parallel_for
4357 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4358 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4359 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4360 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4362 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4363 (default_mode_type,default_algo_type,
4365 blocksize, blocksize,
4367 C.data(), C.stride_0(), C.stride_1(),
4368 v_1.data(), v_1.stride_0(),
4370 v_2.data(), v_2.stride_0());
4374 Kokkos::parallel_for
4375 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4377 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4378 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4379 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4381 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4382 (default_mode_type,default_algo_type,
4384 blocksize, blocksize,
4386 C.data(), C.stride_0(), C.stride_1(),
4387 v_1.data(), v_1.stride_0(),
4389 v_2.data(), v_2.stride_0());
4392 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4393 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4394 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4396 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4397 (default_mode_type,default_algo_type,
4399 blocksize, blocksize,
4401 C.data(), C.stride_0(), C.stride_1(),
4402 v_1.data(), v_1.stride_0(),
4404 v_2.data(), v_2.stride_0());
4411 KOKKOS_INLINE_FUNCTION
4413 operator() (
const SingleVectorSchurTag<B> &,
const member_type &member)
const {
4414 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4416 const local_ordinal_type partidx = packptr_sub(packidx);
4418 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4420 const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0);
4421 const local_ordinal_type nrows = 2*(n_subparts_per_part-1);
4423 const local_ordinal_type r0_schur = nrows * member.league_rank();
4425 internal_vector_scratch_type_3d_view
4426 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4428 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4429 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4430 for (local_ordinal_type i = 0; i < 2; ++i) {
4431 copy3DView<local_ordinal_type>(member,
4432 Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4433 Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4437 Kokkos::parallel_for
4438 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4439 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4442 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4443 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4444 for (local_ordinal_type i = 0; i < 2; ++i) {
4445 copy3DView<local_ordinal_type>(member,
4446 Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4447 Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4453 KOKKOS_INLINE_FUNCTION
4455 operator() (
const SingleVectorApplyETag<B> &,
const member_type &member)
const {
4456 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4458 const local_ordinal_type subpartidx = packptr_sub(packidx);
4459 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4460 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4461 const local_ordinal_type partidx = subpartidx%n_parts;
4462 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4464 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4465 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4467 internal_vector_scratch_type_3d_view
4468 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4472 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4474 typedef SolveTridiagsDefaultModeAndAlgo
4475 <
typename execution_space::memory_space> default_mode_and_algo_type;
4477 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4478 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4480 if (local_subpartidx == 0) {
4481 Kokkos::parallel_for
4482 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4484 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4486 for (local_ordinal_type row = 0; row < nrows; ++row) {
4487 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4488 auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4490 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4491 (default_mode_type,default_algo_type,
4493 blocksize, blocksize,
4495 E.data(), E.stride_0(), E.stride_1(),
4496 v_2.data(), v_2.stride_0(),
4498 v_1.data(), v_1.stride_0());
4502 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4503 Kokkos::parallel_for
4504 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4505 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4507 for (local_ordinal_type row = 0; row < nrows; ++row) {
4508 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4509 auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4511 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4512 (default_mode_type,default_algo_type,
4514 blocksize, blocksize,
4516 E.data(), E.stride_0(), E.stride_1(),
4517 v_2.data(), v_2.stride_0(),
4519 v_1.data(), v_1.stride_0());
4524 Kokkos::parallel_for
4525 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4527 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4529 for (local_ordinal_type row = 0; row < nrows; ++row) {
4530 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4531 auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4533 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4534 (default_mode_type,default_algo_type,
4536 blocksize, blocksize,
4538 E.data(), E.stride_0(), E.stride_1(),
4539 v_2.data(), v_2.stride_0(),
4541 v_1.data(), v_1.stride_0());
4545 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4547 for (local_ordinal_type row = 0; row < nrows; ++row) {
4548 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4549 auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4551 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4552 (default_mode_type,default_algo_type,
4554 blocksize, blocksize,
4556 E.data(), E.stride_0(), E.stride_1(),
4557 v_2.data(), v_2.stride_0(),
4559 v_1.data(), v_1.stride_0());
4567 KOKKOS_INLINE_FUNCTION
4569 operator() (
const SingleVectorCopyToFlatTag<B> &,
const member_type &member)
const {
4570 const local_ordinal_type packidx = member.league_rank();
4571 const local_ordinal_type partidx = packptr(packidx);
4572 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4573 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4574 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4575 const local_ordinal_type num_vectors = 1;
4577 Kokkos::parallel_for
4578 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](
const int &v) {
4579 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4584 KOKKOS_INLINE_FUNCTION
4586 operator() (
const SingleZeroingTag<B> &,
const member_type &member)
const {
4587 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4588 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4592 void run(
const impl_scalar_type_2d_view_tpetra &Y,
4593 const impl_scalar_type_1d_view &Z) {
4594 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4595 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::SolveTridiags");
4598 this->Y_scalar_multivector = Y;
4599 this->Z_scalar_vector = Z;
4601 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4602 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4604 const local_ordinal_type team_size =
4605 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4606 recommended_team_size(blocksize, vector_length, internal_vector_length);
4607 const int per_team_scratch = internal_vector_scratch_type_3d_view
4608 ::shmem_size(blocksize, num_vectors, vector_loop_size);
4610 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4611 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4612 if (num_vectors == 1) { \
4613 const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4614 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4615 Kokkos::parallel_for \
4616 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4617 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4619 const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4620 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4621 Kokkos::parallel_for \
4622 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4623 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4626 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4627 if (num_vectors == 1) { \
4628 if (packindices_schur.extent(1) <= 0) { \
4629 Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4630 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4631 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4632 Kokkos::parallel_for \
4633 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4639 Kokkos::TeamPolicy<execution_space,SingleZeroingTag<B> > \
4640 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4641 Kokkos::parallel_for \
4642 ("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4646 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag"); \
4647 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4648 Kokkos::TeamPolicy<execution_space,SingleVectorSubLineTag<B> > \
4649 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4650 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4651 Kokkos::parallel_for \
4652 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4654 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4655 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4658 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag"); \
4659 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4660 Kokkos::TeamPolicy<execution_space,SingleVectorApplyCTag<B> > \
4661 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4662 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4663 Kokkos::parallel_for \
4664 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4666 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4667 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4670 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag"); \
4671 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4672 Kokkos::TeamPolicy<execution_space,SingleVectorSchurTag<B> > \
4673 policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4674 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4675 Kokkos::parallel_for \
4676 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4678 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4679 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4682 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag"); \
4683 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4684 Kokkos::TeamPolicy<execution_space,SingleVectorApplyETag<B> > \
4685 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4686 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4687 Kokkos::parallel_for \
4688 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4690 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4691 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4695 Kokkos::TeamPolicy<execution_space,SingleVectorCopyToFlatTag<B> > \
4696 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4697 Kokkos::parallel_for \
4698 ("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4703 Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4704 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4705 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4706 Kokkos::parallel_for \
4707 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4711 switch (blocksize) {
4712 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
4713 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
4714 case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 6);
4715 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
4716 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4717 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4718 case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4719 case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4720 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4721 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4722 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4723 case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4724 default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
4726 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
4728 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4729 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4736 template<
typename MatrixType>
4739 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
4740 const Teuchos::RCP<
const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
4741 const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
4742 const bool overlap_communication_and_computation,
4744 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,
4745 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,
4746 typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z,
4747 typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,
4749 const BlockHelperDetails::PartInterface<MatrixType> &interf,
4752 typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work,
4757 const int max_num_sweeps,
4758 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
4759 const int check_tol_every) {
4760 IFPACK2_BLOCKHELPER_TIMER(
"BlockTriDi::ApplyInverseJacobi");
4763 using node_memory_space =
typename impl_type::node_memory_space;
4764 using local_ordinal_type =
typename impl_type::local_ordinal_type;
4765 using size_type =
typename impl_type::size_type;
4766 using impl_scalar_type =
typename impl_type::impl_scalar_type;
4767 using magnitude_type =
typename impl_type::magnitude_type;
4768 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
4769 using vector_type_1d_view =
typename impl_type::vector_type_1d_view;
4770 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
4771 using tpetra_multivector_type =
typename impl_type::tpetra_multivector_type;
4773 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
4777 "Neither Tpetra importer nor Async importer is null.");
4780 "Maximum number of sweeps must be >= 1.");
4783 const bool is_seq_method_requested = !tpetra_importer.is_null();
4784 const bool is_async_importer_active = !async_importer.is_null();
4785 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4786 const magnitude_type tolerance = tol*tol;
4787 const local_ordinal_type blocksize = btdm.values.extent(1);
4788 const local_ordinal_type num_vectors = Y.getNumVectors();
4789 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4791 const impl_scalar_type zero(0.0);
4793 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4794 "The seq method for applyInverseJacobi, " <<
4795 "which in any case is for developer use only, " <<
4796 "does not support norm-based termination.");
4797 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4798 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4800 std::invalid_argument,
4801 "The seq method for applyInverseJacobi, " <<
4802 "which in any case is for developer use only, " <<
4803 "only supports memory spaces accessible from host.");
4806 const size_type work_span_required = num_blockrows*num_vectors*blocksize;
4807 if (work.span() < work_span_required)
4808 work = vector_type_1d_view(
"vector workspace 1d view", work_span_required);
4811 const local_ordinal_type W_size = interf.packptr.extent(0)-1;
4812 if (local_ordinal_type(W.extent(0)) < W_size)
4813 W = impl_scalar_type_1d_view(
"W", W_size);
4815 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4817 if (is_seq_method_requested) {
4818 if (Z.getNumVectors() != Y.getNumVectors())
4819 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors,
false);
4821 if (is_async_importer_active) {
4823 async_importer->createDataBuffer(num_vectors);
4824 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4830 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4831 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4832 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4833 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4834 if (is_y_zero) Kokkos::deep_copy(YY, zero);
4837 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4838 damping_factor, is_norm_manager_active);
4840 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4841 BlockHelperDetails::ComputeResidualVector<MatrixType>
4842 compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf,
4843 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
4846 if (is_norm_manager_active)
4847 norm_manager.setCheckFrequency(check_tol_every);
4851 for (;sweep<max_num_sweeps;++sweep) {
4855 multivector_converter.run(XX);
4857 if (is_seq_method_requested) {
4861 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
4862 compute_residual_vector.run(YY, XX, ZZ);
4865 multivector_converter.run(YY);
4869 if (overlap_communication_and_computation || !is_async_importer_active) {
4870 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
4871 compute_residual_vector.run(pmv, XX, YY, remote_multivector,
true);
4872 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
4873 if (is_async_importer_active) async_importer->cancel();
4876 if (is_async_importer_active) {
4877 async_importer->syncRecv();
4878 compute_residual_vector.run(pmv, XX, YY, remote_multivector,
false);
4881 if (is_async_importer_active)
4882 async_importer->syncExchange(YY);
4883 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance))
break;
4884 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
4892 solve_tridiags.run(YY, W);
4895 if (is_norm_manager_active) {
4897 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
4898 if (sweep + 1 == max_num_sweeps) {
4899 norm_manager.ireduce(sweep,
true);
4900 norm_manager.checkDone(sweep + 1, tolerance,
true);
4902 norm_manager.ireduce(sweep);
4910 if (is_norm_manager_active) norm_manager.finalize();
4916 template<
typename MatrixType>
4919 using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
4920 using block_tridiags_type = BlockTridiags<MatrixType>;
4923 using async_import_type = AsyncableImport<MatrixType>;
4929 bool overlap_communication_and_computation;
4932 mutable typename impl_type::tpetra_multivector_type Z;
4933 mutable typename impl_type::impl_scalar_type_1d_view W;
4936 part_interface_type part_interface;
4937 block_tridiags_type block_tridiags;
4939 mutable typename impl_type::vector_type_1d_view work;
4940 mutable norm_manager_type norm_manager;
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:173
BlockHelperDetails::PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const Teuchos::Array< Teuchos::Array< typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type > > &partitions, const typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type n_subparts_per_part_in)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1007
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
int applyInverseJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Z, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::vector_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:4738
size_t size_type
Definition: Ifpack2_BlockHelper.hpp:294
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
#define TEUCHOS_TEST_FOR_EXCEPT_MSG(throw_exception_test, msg)
BlockTridiags< MatrixType > createBlockTridiags(const BlockHelperDetails::PartInterface< MatrixType > &interf)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1578
Definition: Ifpack2_BlockHelper.hpp:388
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:129
void send(const Packet sendBuffer[], const Ordinal count, const int destRank, const int tag, const Comm< Ordinal > &comm)
void performNumericPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tiny)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3481
void performSymbolicPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, BlockHelperDetails::AmD< MatrixType > &amd, const bool overlap_communication_and_computation)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1820
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockHelper.hpp:303
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:886
Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_block_crs_matrix_type > &A)
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:196
Definition: Ifpack2_BlockHelper.hpp:229
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:2136
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockHelper.hpp:359
RCP< CommRequest< Ordinal > > isend(const ArrayRCP< const Packet > &sendBuffer, const int destRank, const int tag, const Comm< Ordinal > &comm)
#define TEUCHOS_ASSERT(assertion_test)
Definition: Ifpack2_BlockHelper.hpp:256
Definition: Ifpack2_BlockHelper.hpp:290
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:1523
Definition: Ifpack2_BlockComputeResidualVector.hpp:56
Definition: Ifpack2_BlockTriDiContainer_impl.hpp:3495