#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP

#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#include <Kokkos_ArithTraits.hpp>
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_SetIdentity_Decl.hpp>
#include <KokkosBatched_SetIdentity_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Serial_Impl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>

#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif

#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3

namespace Ifpack2 {

  namespace BlockTriDiContainerDetails {

    using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
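    ///
    /// view decorators for unmanaged and const memory: each alias below
    /// rebuilds a Kokkos::View type with extra memory traits (Unmanaged,
    /// Atomic, Const, or scratch memory) so device kernels can wrap existing
    /// allocations without reference counting
    ///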
    template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
    using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::Unmanaged |
                                              MemoryTraitsType::RandomAccess |
                                              flag>;

    template <typename ViewType>
    using Unmanaged = Kokkos::View<typename ViewType::data_type,
                                   typename ViewType::array_layout,
                                   typename ViewType::device_type,
                                   MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
    template <typename ViewType>
    using Atomic = Kokkos::View<typename ViewType::data_type,
                                typename ViewType::array_layout,
                                typename ViewType::device_type,
                                MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
    template <typename ViewType>
    using Const = Kokkos::View<typename ViewType::const_data_type,
                               typename ViewType::array_layout,
                               typename ViewType::device_type,
                               typename ViewType::memory_traits>;
    template <typename ViewType>
    using ConstUnmanaged = Const<Unmanaged<ViewType> >;

    template <typename ViewType>
    using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
    template <typename ViewType>
    using Scratch = Kokkos::View<typename ViewType::data_type,
                                 typename ViewType::array_layout,
                                 typename ViewType::execution_space::scratch_memory_space,
                                 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
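    ///
    /// cuda specialization: compile-time test used below to choose between
    /// team policies (device) and range policies (host)
    ///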
    template<typename T> struct is_cuda { enum : bool { value = false }; };
#if defined(KOKKOS_ENABLE_CUDA)
    template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
#endif
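    ///
    /// utility: build a "Rank r of n: " prefix for error messages
    ///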
    template<typename CommPtrType>
    std::string get_msg_prefix(const CommPtrType &comm) {
      const auto rank = comm->getRank();
      const auto nranks = comm->getSize();
      std::stringstream ss;
      ss << "Rank " << rank << " of " << nranks << ": ";
      return ss.str();
    }
    template<typename T, int N>
    struct ArrayValueType {
      T v[N];
      KOKKOS_INLINE_FUNCTION
      ArrayValueType() {
        for (int i=0;i<N;++i)
          this->v[i] = 0;
      }
      KOKKOS_INLINE_FUNCTION
      ArrayValueType(const ArrayValueType &b) {
        for (int i=0;i<N;++i)
          this->v[i] = b.v[i];
      }
    };

    template<typename T, int N>
    KOKKOS_INLINE_FUNCTION
    void
    operator+=(volatile ArrayValueType<T,N> &a,
               const volatile ArrayValueType<T,N> &b) {
      for (int i=0;i<N;++i)
        a.v[i] += b.v[i];
    }

    template<typename T, int N>
    KOKKOS_INLINE_FUNCTION
    void
    operator+=(ArrayValueType<T,N> &a,
               const ArrayValueType<T,N> &b) {
      for (int i=0;i<N;++i)
        a.v[i] += b.v[i];
    }
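    ///
    /// custom reducer for ArrayValueType; follows the Kokkos reducer concept
    /// (value_type, join, init, view)
    ///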
    template<typename T, int N, typename ExecSpace>
    struct SumReducer {
      typedef SumReducer reducer;
      typedef ArrayValueType<T,N> value_type;
      typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
      value_type *value;

      KOKKOS_INLINE_FUNCTION
      SumReducer(value_type &val) : value(&val) {}

      KOKKOS_INLINE_FUNCTION
      void join(value_type &dst, value_type &src) const {
        for (int i=0;i<N;++i)
          dst.v[i] += src.v[i];
      }
      KOKKOS_INLINE_FUNCTION
      void join(volatile value_type &dst, const volatile value_type &src) const {
        for (int i=0;i<N;++i)
          dst.v[i] += src.v[i];
      }
      KOKKOS_INLINE_FUNCTION
      void init(value_type &val) const {
        for (int i=0;i<N;++i)
          val.v[i] = Kokkos::reduction_identity<T>::sum();
      }
      KOKKOS_INLINE_FUNCTION
      value_type& reference() {
        return *value;
      }
      KOKKOS_INLINE_FUNCTION
      result_view_type view() const {
        return result_view_type(value);
      }
    };
#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
#else
#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
#endif

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN(label)        \
  cudaProfilerStart();                                                  \
  IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
  { cudaProfilerStop(); }
#else
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN(label) \
  IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
#endif
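    ///
    /// implementation typedefs: ImplType gathers every scalar, ordinal,
    /// Tpetra, SIMD, and view type the container needs in one place
    ///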
    template <typename MatrixType>
    struct ImplType {
    public:
      typedef size_t size_type;
      typedef typename MatrixType::scalar_type scalar_type;
      typedef typename MatrixType::local_ordinal_type local_ordinal_type;
      typedef typename MatrixType::global_ordinal_type global_ordinal_type;
      typedef typename MatrixType::node_type node_type;

      // kokkos arithmetic traits of scalar_type
      typedef typename Kokkos::Details::ArithTraits<scalar_type>::val_type impl_scalar_type;
      typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;

      // kokkos types
      typedef typename node_type::device_type device_type;
      typedef typename device_type::execution_space execution_space;
      typedef typename device_type::memory_space memory_space;
      typedef Kokkos::DefaultHostExecutionSpace host_execution_space;

      // tpetra types
      typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
      typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
      typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
      typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
      typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
      typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
      typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
      typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_type local_crs_graph_type;

      // batched SIMD types
      template<typename T, int l> using Vector = KokkosBatched::Experimental::Vector<T,l>;
      template<typename T> using SIMD = KokkosBatched::Experimental::SIMD<T>;
      template<typename T, typename M> using DefaultVectorLength = KokkosBatched::Experimental::DefaultVectorLength<T,M>;
      template<typename T, typename M> using DefaultInternalVectorLength = KokkosBatched::Experimental::DefaultInternalVectorLength<T,M>;

      static constexpr int vector_length = DefaultVectorLength<impl_scalar_type,memory_space>::value;
      static constexpr int internal_vector_length = DefaultInternalVectorLength<impl_scalar_type,memory_space>::value;
      typedef Vector<SIMD<impl_scalar_type>,vector_length> vector_type;
      typedef Vector<SIMD<impl_scalar_type>,internal_vector_length> internal_vector_type;

      // ordinal and scalar views
      typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
      typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
      typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
      typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;

      // packed block views
      typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
      typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
      typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
      typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
      typedef Kokkos::View<impl_scalar_type***,Kokkos::LayoutRight,device_type> impl_scalar_type_3d_view;
      typedef Kokkos::View<impl_scalar_type****,Kokkos::LayoutRight,device_type> impl_scalar_type_4d_view;
    };
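    ///
    /// setup sequential importer: build point maps from the block domain and
    /// column maps so a plain Tpetra::Import can move block multivector data
    ///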
    template<typename MatrixType>
    Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
    createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
      using impl_type = ImplType<MatrixType>;
      using tpetra_map_type = typename impl_type::tpetra_map_type;
      using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
      using tpetra_import_type = typename impl_type::tpetra_import_type;

      const auto g = A->getCrsGraph();  // tpetra crs graph object
      const auto blocksize = A->getBlockSize();
      const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
      const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
      return Teuchos::rcp(new tpetra_import_type(src, tgt));
    }
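    ///
    /// AsyncableImport bypasses Tpetra::Import for the halo exchange so that
    /// sends and receives can be posted early and overlapped with computation;
    /// the MPI wrappers below compile to no-ops without HAVE_IFPACK2_MPI
    ///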
    template<typename MatrixType>
    struct AsyncableImport {
    public:
      using impl_type = ImplType<MatrixType>;

    private:
      ///
      /// MPI wrappers
      ///
#if !defined(HAVE_IFPACK2_MPI)
      typedef int MPI_Request;
      typedef int MPI_Comm;
#endif
      using scalar_type = typename impl_type::scalar_type;

      template <typename T>
      static int isend(const MPI_Comm comm, const T* buf, int count, int dest, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
        MPI_Request ureq;
        const auto dt = Teuchos::Details::MpiTypeTraits<scalar_type>::getType();
        int ret = MPI_Isend(const_cast<T*>(buf), count, dt, dest, tag, comm, ireq == NULL ? &ureq : ireq);
        if (ireq == NULL) MPI_Request_free(&ureq);
        return ret;
#else
        return 0;
#endif
      }
      template <typename T>
      static int irecv(const MPI_Comm comm, T* buf, int count, int src, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
        MPI_Request ureq;
        const auto dt = Teuchos::Details::MpiTypeTraits<scalar_type>::getType();
        int ret = MPI_Irecv(buf, count, dt, src, tag, comm, ireq == NULL ? &ureq : ireq);
        if (ireq == NULL) MPI_Request_free(&ureq);
        return ret;
#else
        return 0;
#endif
      }

      static int waitany(int count, MPI_Request* reqs, int* index) {
#ifdef HAVE_IFPACK2_MPI
        return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
        return 0;
#endif
      }

      static int waitall(int count, MPI_Request* reqs) {
#ifdef HAVE_IFPACK2_MPI
        return MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE);
#else
        return 0;
#endif
      }
    public:
      using tpetra_map_type = typename impl_type::tpetra_map_type;
      using tpetra_import_type = typename impl_type::tpetra_import_type;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using global_ordinal_type = typename impl_type::global_ordinal_type;
      using size_type = typename impl_type::size_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;

      // host views
      using host_execution_space = typename impl_type::host_execution_space;
      using int_1d_view_host = Kokkos::View<int*,host_execution_space>;
      using local_ordinal_type_1d_view_host = typename impl_type::local_ordinal_type_1d_view::HostMirror;

      using execution_space = typename impl_type::execution_space;
      using memory_space = typename impl_type::memory_space;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
      using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;

#ifdef HAVE_IFPACK2_MPI
      MPI_Comm comm;
#endif
      impl_scalar_type_2d_view remote_multivector;
      local_ordinal_type blocksize;

      template<typename T>
      struct SendRecvPair {
        T send;
        T recv;
      };

      SendRecvPair<int_1d_view_host> pids;            // mpi process ids
      SendRecvPair<std::vector<MPI_Request> > reqs;   // MPI_Request is a pointer; cannot use a kokkos view
      SendRecvPair<size_type_1d_view> offset;         // offsets into the local id lists and data buffers
      SendRecvPair<local_ordinal_type_1d_view> lids;  // local id lists
      SendRecvPair<impl_scalar_type_1d_view> buffer;  // data buffers

      local_ordinal_type_1d_view dm2cm;               // permutation, domain map to column map
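      // setOffsetValues turns per-process message lengths into exclusive
      // offsets; e.g. lens = [3,2,4] yields offs = [0,3,5,9], so message i
      // occupies [offs(i), offs(i+1)) in the packed buffer.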
      void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                           const size_type_1d_view &offs) {
        // wrap the raw host pointer and copy the lengths to device
        Kokkos::View<size_t*,host_execution_space> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
        const auto lens_device = Kokkos::create_mirror_view(memory_space(), lens_host);
        Kokkos::deep_copy(lens_device, lens_host);

        // exclusive scan
        const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
        const local_ordinal_type lens_size = lens_device.extent(0);
        Kokkos::parallel_scan
          ("AsyncableImport::RangePolicy::setOffsetValues",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
            if (final)
              offs(i) = update;
            update += (i < lens_size ? lens_device[i] : 0);
          });
      }
      void createMpiRequests(const tpetra_import_type &import) {
        Tpetra::Distributor &distributor = import.getDistributor();

        // copy receive and send process ids
        const auto pids_from = distributor.getProcsFrom();
        pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
        memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
        const auto pids_to = distributor.getProcsTo();
        pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
        memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));

        // mpi requests, zero-initialized
        reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
        reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));

        // construct offsets from the message lengths
        const auto lengths_to = distributor.getLengthsTo();
        offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
        const auto lengths_from = distributor.getLengthsFrom();
        offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
        setOffsetValues(lengths_to, offset.send);
        setOffsetValues(lengths_from, offset.recv);
      }
      void createSendRecvIDs(const tpetra_import_type &import) {
        // receive local ids
        const auto remote_lids = import.getRemoteLIDs();
        const local_ordinal_type_1d_view_host
          remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
        lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
        Kokkos::deep_copy(lids.recv, remote_lids_view_host);

        // send local ids
        auto epids = import.getExportPIDs();
        auto elids = import.getExportLIDs();
        TEUCHOS_ASSERT(epids.size() == elids.size());
        lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
        auto lids_send_host = Kokkos::create_mirror_view(lids.send);

        // cnt enumerates the export ids grouped by the destination pid
        for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
          const auto pid_send_value = pids.send[i];
          for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
            if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
#if !defined(__CUDA_ARCH__)
          TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset.send[i+1]);
#endif
        }
        Kokkos::deep_copy(lids.send, lids_send_host);
      }
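      ///
      /// PackTag-dispatched copy: ToBuffer packs multivector rows into the
      /// send buffer, ToMultiVector unpacks the receive buffer; the kernel
      /// shape (team/range/MDRange policy) depends on backend and vector count
      ///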
      struct ToBuffer {};
      struct ToMultiVector {};

      template<typename PackTag>
      static
      void copy(const local_ordinal_type_1d_view &lids_,
                const impl_scalar_type_1d_view &buffer_,
                const local_ordinal_type &ibeg_,
                const local_ordinal_type &iend_,
                const impl_scalar_type_2d_view &multivector_,
                const local_ordinal_type blocksize_) {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::Copy");
        const local_ordinal_type num_vectors = multivector_.extent(1);
        const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
        const local_ordinal_type idiff = iend_ - ibeg_;
        const auto abase = buffer_.data() + mv_blocksize*ibeg_;
        if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
          using team_policy_type = Kokkos::TeamPolicy<execution_space>;
          local_ordinal_type vector_size(0);
          if      (blocksize_ <=  4) vector_size =  4;
          else if (blocksize_ <=  8) vector_size =  8;
          else if (blocksize_ <= 16) vector_size = 16;
          else                       vector_size = 32;
          const team_policy_type policy(idiff, 1, vector_size);
          Kokkos::parallel_for
            ("AsyncableImport::TeamPolicy::copy",
             policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
              const local_ordinal_type i = member.league_rank();
              Kokkos::parallel_for
                (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
                  auto aptr = abase + blocksize_*(i + idiff*j);
                  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
                  if (std::is_same<PackTag,ToBuffer>::value)
                    Kokkos::parallel_for
                      (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                        aptr[k] = bptr[k];
                      });
                  else
                    Kokkos::parallel_for
                      (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                        bptr[k] = aptr[k];
                      });
                });
            });
#endif
        } else {
#if defined(__CUDA_ARCH__)
          // never reached when compiling device code; the CUDA path above handles it
#else
          if (num_vectors == 1) {
            const Kokkos::RangePolicy<execution_space> policy(ibeg_, iend_);
            Kokkos::parallel_for
              ("AsyncableImport::RangePolicy::copy",
               policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
                auto aptr = buffer_.data() + blocksize_*i;
                auto bptr = multivector_.data() + blocksize_*lids_(i);
                if (std::is_same<PackTag,ToBuffer>::value)
                  memcpy(aptr, bptr, sizeof(impl_scalar_type)*blocksize_);
                else
                  memcpy(bptr, aptr, sizeof(impl_scalar_type)*blocksize_);
              });
          } else {
            Kokkos::MDRangePolicy
              <execution_space, Kokkos::Rank<2>, Kokkos::IndexType<local_ordinal_type> >
              policy( { 0, 0 }, { idiff, num_vectors } );
            Kokkos::parallel_for
              ("AsyncableImport::MDRangePolicy::copy",
               policy, KOKKOS_LAMBDA(const local_ordinal_type &i,
                                     const local_ordinal_type &j) {
                auto aptr = abase + blocksize_*(i + idiff*j);
                auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
                if (std::is_same<PackTag,ToBuffer>::value)
                  for (local_ordinal_type k=0;k<blocksize_;++k) aptr[k] = bptr[k];
                else
                  for (local_ordinal_type k=0;k<blocksize_;++k) bptr[k] = aptr[k];
              });
          }
#endif
        }
      }
      void createDataBuffer(const local_ordinal_type &num_vectors) {
        const size_type extent_0 = lids.recv.extent(0)*blocksize;
        const size_type extent_1 = num_vectors;
        if (remote_multivector.extent(0) == extent_0 &&
            remote_multivector.extent(1) == extent_1) {
          // buffers already have the right shape
          return;
        }
        remote_multivector = impl_scalar_type_2d_view(do_not_initialize_tag("remote multivector"), extent_0, extent_1);

        const auto send_buffer_size = offset.send[offset.send.extent(0)-1]*blocksize*num_vectors;
        buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
        const auto recv_buffer_size = offset.recv[offset.recv.extent(0)-1]*blocksize*num_vectors;
        buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
      }
      AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
                      const Teuchos::RCP<const tpetra_map_type> &tgt_map,
                      const local_ordinal_type blocksize_,
                      const local_ordinal_type_1d_view dm2cm_) {
        blocksize = blocksize_;
        dm2cm = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
        const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(tgt_map->getComm());
        TEUCHOS_ASSERT(!mpi_comm.is_null());
        comm = *mpi_comm->getRawMpiComm();
#endif
        const tpetra_import_type import(src_map, tgt_map);
        createMpiRequests(import);
        createSendRecvIDs(import);
      }
      void asyncSendRecv(const impl_scalar_type_2d_view &mv) {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
#ifdef HAVE_IFPACK2_MPI
        // constants and (re)allocation of data buffers if necessary
        const local_ordinal_type num_vectors = mv.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;
        createDataBuffer(num_vectors);

        // receive async
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          irecv(comm,
                buffer.recv.data() + offset.recv[i]*mv_blocksize,
                (offset.recv[i+1] - offset.recv[i])*mv_blocksize,
                pids.recv[i], 42, &reqs.recv[i]);
        }

        // pack and send async
        for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
          copy<ToBuffer>(lids.send, buffer.send, offset.send(i), offset.send(i+1),
                         mv, blocksize);
          isend(comm,
                buffer.send.data() + offset.send[i]*mv_blocksize,
                (offset.send[i+1] - offset.send[i])*mv_blocksize,
                pids.send[i], 42, &reqs.send[i]);
        }

        // an Iprobe can nudge some MPI implementations to make progress on
        // the posted requests
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          int flag;
          MPI_Status stat;
          MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
        }
#endif
      }
      void cancel() {
        // wait on every outstanding receive and send
#ifdef HAVE_IFPACK2_MPI
        waitall(reqs.recv.size(), reqs.recv.data());
        waitall(reqs.send.size(), reqs.send.data());
#endif
      }

      void syncRecv() {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
#ifdef HAVE_IFPACK2_MPI
        // unpack each message into the remote multivector as it completes
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          local_ordinal_type idx = i;
          waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
          copy<ToMultiVector>(lids.recv, buffer.recv, offset.recv(idx), offset.recv(idx+1),
                              remote_multivector, blocksize);
        }
        // wait for the matching sends to complete as well
        waitall(reqs.send.size(), reqs.send.data());
#endif
      }
      void syncExchange(const impl_scalar_type_2d_view &mv) {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
        asyncSendRecv(mv);
        syncRecv();
      }

      impl_scalar_type_2d_view getRemoteMultiVectorLocalView() const { return remote_multivector; }
    };
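    ///
    /// setup async importer: detect whether remote columns are cleanly
    /// separated from owned ones in the column map; only then can the halo
    /// exchange be overlapped with computation
    ///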
    template<typename MatrixType>
    Teuchos::RCP<AsyncableImport<MatrixType> >
    createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
      using impl_type = ImplType<MatrixType>;
      using tpetra_map_type = typename impl_type::tpetra_map_type;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using global_ordinal_type = typename impl_type::global_ordinal_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

      const auto g = A->getCrsGraph();  // tpetra crs graph object
      const auto blocksize = A->getBlockSize();
      const auto domain_map = g.getDomainMap();
      const auto column_map = g.getColMap();

      std::vector<global_ordinal_type> gids;
      bool separate_remotes = true, found_first = false, need_owned_permutation = false;
      for (size_t i=0;i<column_map->getNodeNumElements();++i) {
        const global_ordinal_type gid = column_map->getGlobalElement(i);
        if (!domain_map->isNodeGlobalElement(gid)) {
          found_first = true;
          gids.push_back(gid);
        } else if (found_first) {
          separate_remotes = false;
          break;
        }
        if (!need_owned_permutation &&
            domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
          // the owned parts of the domain and column maps are ordered
          // differently, so a permutation is needed
          need_owned_permutation = true;
        }
      }

      if (separate_remotes) {
        const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
        const auto parsimonious_col_map
          = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
        if (parsimonious_col_map->getGlobalNumElements() > 0) {
          // make the importer only if needed
          local_ordinal_type_1d_view dm2cm;
          if (need_owned_permutation) {
            dm2cm = local_ordinal_type_1d_view("dm2cm", domain_map->getNodeNumElements());
            const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
            for (size_t i=0;i<domain_map->getNodeNumElements();++i)
              dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
            Kokkos::deep_copy(dm2cm, dm2cm_host);
          }
          return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
        }
      }
      return Teuchos::null;
    }
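    ///
    /// part interface: A = D + R, where D holds the block tridiagonals (one
    /// per "part", i.e., per set of rows) and R is the off-tridiag remainder;
    /// parts are sorted by size and grouped into SIMD packs of vector_length
    ///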
    template<typename MatrixType>
    struct PartInterface {
      using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
      using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;

      PartInterface() = default;
      PartInterface(const PartInterface &b) = default;

      // permutation from the caller's local row ids to the tridiag ordering
      local_ordinal_type_1d_view lclrow;
      // pointer array into lclrow: part i owns rows [partptr(i), partptr(i+1))
      local_ordinal_type_1d_view partptr;
      // packptr(k) indexes partptr: pack k starts at part packptr(k)
      local_ordinal_type_1d_view packptr;
      // part2rowidx0(i) is the flat row index of the start of part i
      local_ordinal_type_1d_view part2rowidx0;
      // part2packrowidx0(i) is the packed row index of the start of part i
      local_ordinal_type_1d_view part2packrowidx0;
      local_ordinal_type part2packrowidx0_back;
      // maps a flat row index back to its part index
      local_ordinal_type_1d_view rowidx2part;
      // true if lclrow is a contiguous, zero-based range
      bool row_contiguous;
      // number of rows in the largest part
      local_ordinal_type max_partsz;
    };
    template<typename MatrixType>
    PartInterface<MatrixType>
    createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                        const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
      using impl_type = ImplType<MatrixType>;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

      constexpr int vector_length = impl_type::vector_length;

      const auto comm = A->getRowMap()->getComm();

      PartInterface<MatrixType> interf;

      const bool jacobi = partitions.size() == 0;
      const local_ordinal_type A_n_lclrows = A->getNodeNumRows();
      const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();

#if defined(BLOCKTRIDICONTAINER_DEBUG)
      local_ordinal_type nrows = 0;
      if (jacobi)
        nrows = nparts;
      else
        for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();

      TEUCHOS_TEST_FOR_EXCEPT_MSG
        (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
         << "the same as getNodeNumRows: " << nrows << " vs " << A_n_lclrows);
#endif

      // p maps the sorted part order back to the input order
      std::vector<local_ordinal_type> p;
      if (jacobi) {
        interf.max_partsz = 1;
      } else {
        // sort parts by size, largest first, to make the packs as uniform
        // as possible
        p.resize(nparts);

        typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
        std::vector<size_idx_pair_type> partsz(nparts);
        for (local_ordinal_type i=0;i<nparts;++i)
          partsz[i] = size_idx_pair_type(partitions[i].size(), i);
        std::sort(partsz.begin(), partsz.end(),
                  [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
                    return x.first > y.first;
                  });
        for (local_ordinal_type i=0;i<nparts;++i)
          p[i] = partsz[i].second;

        interf.max_partsz = partsz[0].first;
      }
      // allocate the interface views
      interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
      interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
      interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
      interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
      interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

      // mirror to host and compute there
      const auto partptr = Kokkos::create_mirror_view(interf.partptr);
      const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
      const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
      const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
      const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);

      // determine parts
      interf.row_contiguous = true;
      partptr(0) = 0;
      part2rowidx0(0) = 0;
      part2packrowidx0(0) = 0;
      local_ordinal_type pack_nrows = 0;
      for (local_ordinal_type ip=0;ip<nparts;++ip) {
        const auto* part = jacobi ? NULL : &partitions[p[ip]];
        const local_ordinal_type ipnrows = jacobi ? 1 : part->size();
        TEUCHOS_ASSERT(ip == 0 || (jacobi || ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
        TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
                                    get_msg_prefix(comm)
                                    << "partition " << p[ip]
                                    << " is empty, which is not allowed.");
        // assume no overlap
        part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
        // since parts are sorted in decreasing size, the size of the first
        // part in a pack is the pack's row count
        if (ip % vector_length == 0) pack_nrows = ipnrows;
        part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
        const local_ordinal_type os = partptr(ip);
        for (local_ordinal_type i=0;i<ipnrows;++i) {
          const auto lcl_row = jacobi ? ip : (*part)[i];
          TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
                                      get_msg_prefix(comm)
                                      << "partitions[" << p[ip] << "]["
                                      << i << "] = " << lcl_row
                                      << " but input matrix implies limits of [0, " << A_n_lclrows-1
                                      << "].");
          lclrow(os+i) = lcl_row;
          rowidx2part(os+i) = ip;
          if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
            interf.row_contiguous = false;
        }
        partptr(ip+1) = os + ipnrows;
      }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
      TEUCHOS_ASSERT(partptr(nparts) == nrows);
#endif
      if (lclrow(0) != 0) interf.row_contiguous = false;

      Kokkos::deep_copy(interf.partptr, partptr);
      Kokkos::deep_copy(interf.lclrow, lclrow);

      // assume no overlap: part2rowidx0 is an alias of partptr
      interf.part2rowidx0 = interf.partptr;
      Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);

      interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
      Kokkos::deep_copy(interf.rowidx2part, rowidx2part);

      { // fill packptr
        local_ordinal_type npacks = 0;
        for (local_ordinal_type ip=1;ip<=nparts;++ip)
          if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
            ++npacks;
        interf.packptr = local_ordinal_type_1d_view("packptr", npacks + 1);
        const auto packptr = Kokkos::create_mirror_view(interf.packptr);
        packptr(0) = 0;
        for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
          if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
            packptr(k++) = ip;
        Kokkos::deep_copy(interf.packptr, packptr);
      }

      return interf;
    }
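    ///
    /// block tridiagonal storage: the blocks of one tridiag are laid out as
    /// (d0,u0),(l0,d1,u1),(l1,d2,u2),..., i.e. three blocks per row except
    /// the first and last rows. For example, a 3-row part stores
    /// NumBlocks(3) = 7 blocks and RowToIndex gives 0, 2, 5 as the row starts.
    ///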
    template <typename MatrixType>
    struct BlockTridiags {
      using impl_type = ImplType<MatrixType>;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;

      // flat_td_ptr(i) is the index into the flat values array of the start
      // of the i'th tridiag; pack_td_ptr is the same for packs
      size_type_1d_view flat_td_ptr, pack_td_ptr;
      // list of local column indices into A from which to grab data;
      // flat_td_ptr(i) points to the start of the i'th tridiag's entries
      local_ordinal_type_1d_view A_colindsub;
      // tridiag block values; pack_td_ptr(i) points to the start of the i'th
      // tridiag's pack, and i % vector_length gives the position in the pack
      vector_type_3d_view values;

      bool is_diagonal_only;

      BlockTridiags() = default;
      BlockTridiags(const BlockTridiags &b) = default;

      // given a block index within a tridiag, return its row
      template <typename idx_type>
      static KOKKOS_FORCEINLINE_FUNCTION
      idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
      // given a row within a tridiag, return the index of its first block
      template <typename idx_type>
      static KOKKOS_FORCEINLINE_FUNCTION
      idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
      // number of blocks in a tridiag with nrows rows: 3 per row, minus the
      // missing sub/super-diagonal blocks of the first and last rows
      template <typename idx_type>
      static KOKKOS_FORCEINLINE_FUNCTION
      idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
    };
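    ///
    /// block tridiags initialization: compute the flat and packed pointers
    /// into the values array via parallel scans over parts and packs
    ///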
    template<typename MatrixType>
    BlockTridiags<MatrixType>
    createBlockTridiags(const PartInterface<MatrixType> &interf) {
      using impl_type = ImplType<MatrixType>;
      using execution_space = typename impl_type::execution_space;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type = typename impl_type::size_type;
      using size_type_1d_view = typename impl_type::size_type_1d_view;

      constexpr int vector_length = impl_type::vector_length;

      BlockTridiags<MatrixType> btdm;

      const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;

      { // construct the flat index pointers into the tridiag values array
        btdm.flat_td_ptr = size_type_1d_view("btdm.flat_td_ptr", ntridiags + 1);
        const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
        Kokkos::parallel_scan
          ("createBlockTridiags::RangePolicy::flat_td_ptr",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
            if (final)
              btdm.flat_td_ptr(i) = update;
            if (i < ntridiags) {
              const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
              update += btdm.NumBlocks(nrows);
            }
          });

        const auto nblocks = Kokkos::create_mirror_view_and_copy
          (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
        btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
      }

      // construct the packed index pointers
      if (vector_length == 1) {
        btdm.pack_td_ptr = btdm.flat_td_ptr;
      } else {
        const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
        btdm.pack_td_ptr = size_type_1d_view("btdm.pack_td_ptr", ntridiags + 1);
        const Kokkos::RangePolicy<execution_space> policy(0,npacks);
        Kokkos::parallel_scan
          ("createBlockTridiags::RangePolicy::pack_td_ptr",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
            const local_ordinal_type parti = interf.packptr(i);
            const local_ordinal_type parti_next = interf.packptr(i+1);
            if (final) {
              const size_type nblks = update;
              for (local_ordinal_type pti=parti;pti<parti_next;++pti)
                btdm.pack_td_ptr(pti) = nblks;
              if (i == npacks-1) {
                // the last entry records the total number of packed blocks
                const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
                btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
              }
            }
            {
              const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
              update += btdm.NumBlocks(nrows);
            }
          });
      }
      return btdm;
    }
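    ///
    /// set tridiags to identity: seeds every diagonal block with ones so that
    /// pack lanes extending past a short part stay trivially invertible
    ///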
    template<typename MatrixType>
    void
    setTridiagsToIdentity
    (const BlockTridiags<MatrixType>& btdm,
     const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
    {
      using impl_type = ImplType<MatrixType>;
      using execution_space = typename impl_type::execution_space;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type_1d_view = typename impl_type::size_type_1d_view;

      const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
      const local_ordinal_type blocksize = btdm.values.extent(1);

      {
        const int vector_length = impl_type::vector_length;
        const int internal_vector_length = impl_type::internal_vector_length;

        using internal_vector_type = typename impl_type::internal_vector_type;
        using internal_vector_type_4d_view =
          typename impl_type::internal_vector_type_4d_view;
        using team_policy_type = Kokkos::TeamPolicy<execution_space>;
        const internal_vector_type_4d_view values
          (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
           btdm.values.extent(0),
           btdm.values.extent(1),
           btdm.values.extent(2),
           vector_length/internal_vector_length);
        const local_ordinal_type vector_loop_size = values.extent(3);
#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
        local_ordinal_type total_team_size(0);
        if      (blocksize <=  5) total_team_size =  32;
        else if (blocksize <=  9) total_team_size =  64;
        else if (blocksize <= 12) total_team_size =  96;
        else if (blocksize <= 16) total_team_size = 128;
        else if (blocksize <= 20) total_team_size = 160;
        else                      total_team_size = 160;
        const local_ordinal_type team_size = total_team_size/vector_loop_size;
        const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
#else // host architecture: team size is always one
        const team_policy_type policy(packptr.extent(0)-1, 1, 1);
#endif
        Kokkos::parallel_for
          ("setTridiagsToIdentity::TeamPolicy",
           policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
            const local_ordinal_type k = member.league_rank();
            const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
            const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
            const local_ordinal_type diff = iend - ibeg;
            const local_ordinal_type icount = diff/3 + (diff%3 > 0);
            Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
                // diagonal blocks sit every third entry within a tridiag
                const local_ordinal_type i = ibeg + ii*3;
                for (local_ordinal_type j=0;j<blocksize;++j)
                  values(i,j,j,v) = 1;
              });
            });
          });
      }
    }
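    ///
    /// A - D storage: compressed indices for the off-tridiagonal remainder R,
    /// with owned and remote columns kept separate so the matrix-vector
    /// product can overlap communication
    ///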
    template <typename MatrixType>
    struct AmD {
      using impl_type = ImplType<MatrixType>;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;

      // rowptr points to the start of each row of A_colindsub
      size_type_1d_view rowptr, rowptr_remote;
      // indices into A's rows giving the blocks to extract; the owned and
      // remote entries of each row are stored separately
      local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
      // whether the matrix is a Tpetra::BlockCrsMatrix
      bool is_tpetra_block_crs;
      // if is_tpetra_block_crs, a view of A's value data
      impl_scalar_type_1d_view tpetra_values;

      AmD() = default;
      AmD(const AmD &b) = default;
    };
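    ///
    /// symbolic phase: walk the local graph once to classify every block
    /// entry as tridiag (D) or remainder (R, owned or remote), then build the
    /// compressed index arrays used by the numeric phase and the apply
    ///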
    template<typename MatrixType>
    void
    performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                         const PartInterface<MatrixType> &interf,
                         BlockTridiags<MatrixType> &btdm,
                         AmD<MatrixType> &amd,
                         const bool overlap_communication_and_computation) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");

      using impl_type = ImplType<MatrixType>;
      using memory_space = typename impl_type::memory_space;
      using host_execution_space = typename impl_type::host_execution_space;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using global_ordinal_type = typename impl_type::global_ordinal_type;
      using size_type = typename impl_type::size_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

      constexpr int vector_length = impl_type::vector_length;

      const auto comm = A->getRowMap()->getComm();
      const auto& g = A->getCrsGraph();
      const auto blocksize = A->getBlockSize();

      // mirror to host
      const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
      const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
      const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
      const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
      const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);

      const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);

      // find the column-to-row map on host
      Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getNodeNumCols());
      Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
      {
        const auto rowmap = g.getRowMap();
        const auto colmap = g.getColMap();
        const auto dommap = g.getDomainMap();
        TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));

#if !defined(__CUDA_ARCH__)
        const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
        Kokkos::parallel_for
          ("performSymbolicPhase::RangePolicy::col2row",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
            const global_ordinal_type gid = rowmap->getGlobalElement(lr);
            if (dommap->isNodeGlobalElement(gid)) {
              const local_ordinal_type lc = colmap->getLocalElement(gid);
#if defined(BLOCKTRIDICONTAINER_DEBUG)
              TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
                                          get_msg_prefix(comm) << "GID " << gid
                                          << " gives an invalid local column.");
#endif
              col2row(lc) = lr;
            }
          });
#endif
      }
      // construct the D and R graphs in A = D + R
      {
        const auto& local_graph = g.getLocalGraph();
        const auto& local_graph_rowptr = local_graph.row_map;
        TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
        const auto& local_graph_colidx = local_graph.entries;

        // assume no overlap
        Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
        {
          const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
          Kokkos::parallel_for
            ("performSymbolicPhase::RangePolicy::lclrow2idx",
             policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
              lclrow2idx[lclrow(i)] = i;
            });
        }

        // count (block) nnz in D and in the owned and remote parts of R
        typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
        typename sum_reducer_type::value_type sum_reducer_value;
        {
          const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
          Kokkos::parallel_reduce
            ("performSymbolicPhase::RangePolicy<host_execution_space>::count_nnz",
             policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
              // LID -> index
              const local_ordinal_type ri0 = lclrow2idx[lr];
              const local_ordinal_type pi0 = rowidx2part(ri0);
              for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
                const local_ordinal_type lc = local_graph_colidx(j);
                const local_ordinal_type lc2r = col2row[lc];
                bool incr_R = false;
                do { // breakable
                  if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
                    incr_R = true;
                    break;
                  }
                  const local_ordinal_type ri = lclrow2idx[lc2r];
                  const local_ordinal_type pi = rowidx2part(ri);
                  if (pi != pi0) {
                    incr_R = true;
                    break;
                  }
                  // test for being in the tridiag, in index space
                  if (ri0 + 1 >= ri && ri0 <= ri + 1)
                    ++update.v[0]; // D_nnz
                  else
                    incr_R = true;
                } while (0);
                if (incr_R) {
                  if (lc < nrows) ++update.v[1]; // R_nnz_owned
                  else ++update.v[2];            // R_nnz_remote
                }
              }
            }, sum_reducer_type(sum_reducer_value));
        }
        size_type D_nnz = sum_reducer_value.v[0];
        size_type R_nnz_owned = sum_reducer_value.v[1];
        size_type R_nnz_remote = sum_reducer_value.v[2];

        if (!overlap_communication_and_computation) {
          R_nnz_owned += R_nnz_remote;
          R_nnz_remote = 0;
        }
        // construct the D graph
        {
          const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);

          btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
          const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);

#if defined(BLOCKTRIDICONTAINER_DEBUG)
          Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif

          const local_ordinal_type nparts = partptr.extent(0) - 1;
          {
            const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
            Kokkos::parallel_for
              ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
               policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
                const local_ordinal_type part_ri0 = part2rowidx0(pi0);
                local_ordinal_type offset = 0;
                for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
                  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
                  offset = 1;
                  const local_ordinal_type lr0 = lclrow(ri0);
                  const size_type j0 = local_graph_rowptr(lr0);
                  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
                    const local_ordinal_type lc = local_graph_colidx(j);
                    const local_ordinal_type lc2r = col2row[lc];
                    if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) continue;
                    const local_ordinal_type ri = lclrow2idx[lc2r];
                    const local_ordinal_type pi = rowidx2part(ri);
                    if (pi != pi0) continue;
                    if (ri + 1 < ri0 || ri > ri0 + 1) continue;
                    const local_ordinal_type row_entry = j - j0;
                    D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
                  }
                }
              });
          }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
          for (size_t i=0;i<D_A_colindsub.extent(0);++i)
            TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif
          Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);

          // allocate the values array and initialize the tridiags
          {
            const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
            const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
            btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
            if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
          }
        }
        // construct the R graph
        {
          amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
          amd.A_colindsub = local_ordinal_type_1d_view(Kokkos::ViewAllocateWithoutInitializing("amd.A_colindsub"), R_nnz_owned);

          const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
          const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);

          amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
          amd.A_colindsub_remote = local_ordinal_type_1d_view(Kokkos::ViewAllocateWithoutInitializing("amd.A_colindsub_remote"), R_nnz_remote);

          const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
          const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);

          { // count the entries in each row of R
            const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
            Kokkos::parallel_for
              ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
               policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
                const local_ordinal_type ri0 = lclrow2idx[lr];
                const local_ordinal_type pi0 = rowidx2part(ri0);
                const size_type j0 = local_graph_rowptr(lr);
                for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
                  const local_ordinal_type lc = local_graph_colidx(j);
                  const local_ordinal_type lc2r = col2row[lc];
                  if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
                    const local_ordinal_type ri = lclrow2idx[lc2r];
                    const local_ordinal_type pi = rowidx2part(ri);
                    if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
                      // in the tridiag: belongs to D, not R
                      continue;
                    }
                  }
                  if (!overlap_communication_and_computation || lc < nrows) {
                    ++R_rowptr(lr);
                  } else {
                    ++R_rowptr_remote(lr);
                  }
                }
              });
          }

          // exclusive scan: convert counts to row pointers and fill colindsub
          typedef ArrayValueType<size_type,2> update_type;
          {
            Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
            Kokkos::parallel_scan
              ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
               policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
                                     update_type &update,
                                     const bool &final) {
                update_type val;
                val.v[0] = R_rowptr(lr);
                if (overlap_communication_and_computation)
                  val.v[1] = R_rowptr_remote(lr);

                if (final) {
                  R_rowptr(lr) = update.v[0];
                  if (overlap_communication_and_computation)
                    R_rowptr_remote(lr) = update.v[1];

                  if (lr < nrows) {
                    const local_ordinal_type ri0 = lclrow2idx[lr];
                    const local_ordinal_type pi0 = rowidx2part(ri0);

                    size_type cnt_rowptr = R_rowptr(lr);
                    size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0;

                    const size_type j0 = local_graph_rowptr(lr);
                    for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
                      const local_ordinal_type lc = local_graph_colidx(j);
                      const local_ordinal_type lc2r = col2row[lc];
                      if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
                        const local_ordinal_type ri = lclrow2idx[lc2r];
                        const local_ordinal_type pi = rowidx2part(ri);
                        if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
                          continue;
                      }
                      const local_ordinal_type row_entry = j - j0;
                      if (!overlap_communication_and_computation || lc < nrows)
                        R_A_colindsub(cnt_rowptr++) = row_entry;
                      else
                        R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
                    }
                  }
                }
                update += val;
              });
          }
          Kokkos::deep_copy(amd.rowptr, R_rowptr);
          Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
          if (overlap_communication_and_computation) {
            Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
            Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
          }

          // view A's values
          amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->
                               template getValues<memory_space>());
        }
      }
    }
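    ///
    /// numeric phase mode/algo dispatch: serial (blocked, or MKL compact when
    /// available) on host memory spaces, team-parallel unblocked on CUDA
    ///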
    template<typename ArgActiveExecutionMemorySpace>
    struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;

    template<>
    struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
      typedef KokkosBatched::Experimental::Mode::Serial mode_type;
#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
      typedef KokkosBatched::Experimental::Algo::Level3::CompactMKL algo_type;
#else
      typedef KokkosBatched::Experimental::Algo::Level3::Blocked algo_type;
#endif
      static int recommended_team_size(const int /* blksize */,
                                       const int /* vector_length */,
                                       const int /* internal_vector_length */) {
        return 1;
      }
    };

#if defined(KOKKOS_ENABLE_CUDA)
    static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
                                                                 const int vector_length,
                                                                 const int internal_vector_length) {
      const int vector_size = vector_length/internal_vector_length;
      int total_team_size(0);
      if      (blksize <=  5) total_team_size =  32;
      else if (blksize <=  9) total_team_size =  64;
      else if (blksize <= 12) total_team_size =  96;
      else if (blksize <= 16) total_team_size = 128;
      else if (blksize <= 20) total_team_size = 160;
      else                    total_team_size = 160;
      return 2*total_team_size/vector_size;
    }
    template<>
    struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
      typedef KokkosBatched::Experimental::Mode::Team mode_type;
      typedef KokkosBatched::Experimental::Algo::Level3::Unblocked algo_type;
      static int recommended_team_size(const int blksize,
                                       const int vector_length,
                                       const int internal_vector_length) {
        return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
      }
    };
    template<>
    struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
      typedef KokkosBatched::Experimental::Mode::Team mode_type;
      typedef KokkosBatched::Experimental::Algo::Level3::Unblocked algo_type;
      static int recommended_team_size(const int blksize,
                                       const int vector_length,
                                       const int internal_vector_length) {
        return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
      }
    };
#endif
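    ///
    /// extract and factorize the tridiagonal blocks: one team per pack,
    /// vector lanes spanning the parts packed together
    ///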
    template<typename MatrixType>
    struct ExtractAndFactorizeTridiags {
    public:
      using impl_type = ImplType<MatrixType>;
      // a functor cannot have both device_type and execution/memory spaces
      using execution_space = typename impl_type::execution_space;
      using memory_space = typename impl_type::memory_space;
      // scalar types
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type = typename impl_type::size_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;
      using magnitude_type = typename impl_type::magnitude_type;
      // tpetra interface
      using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
      // views
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
      // vector views
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
      using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
      using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
      using impl_scalar_scratch_type_3d_view = Scratch<typename impl_type::impl_scalar_type_3d_view>;

      using internal_vector_type = typename impl_type::internal_vector_type;
      static constexpr int vector_length = impl_type::vector_length;
      static constexpr int internal_vector_length = impl_type::internal_vector_length;

      // team policy member type
      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      using member_type = typename team_policy_type::member_type;

    private:
      // part interface
      const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
      const local_ordinal_type max_partsz;
      // block crs matrix; the graph rowptr may use a different index type
      using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::device_type>::size_type;
      const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,typename impl_type::device_type> > A_rowptr;
      const ConstUnmanaged<impl_scalar_type_1d_view> A_values;
      // block tridiags
      const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
      const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
      const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
      const Unmanaged<impl_scalar_type_4d_view> scalar_values;
      // shared information
      const local_ordinal_type blocksize, blocksize_square;
      // diagonal safeguard
      const magnitude_type tiny;
      const local_ordinal_type vector_loop_size;
      const local_ordinal_type vector_length_value;

    public:
      ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
                                  const PartInterface<MatrixType> &interf_,
                                  const Teuchos::RCP<const block_crs_matrix_type> &A_,
                                  const magnitude_type& tiny_) :
        // interface
        partptr(interf_.partptr),
        lclrow(interf_.lclrow),
        packptr(interf_.packptr),
        max_partsz(interf_.max_partsz),
        // block crs matrix
        A_rowptr(A_->getCrsGraph().getLocalGraph().row_map),
        A_values(const_cast<block_crs_matrix_type*>(A_.get())->template getValues<memory_space>()),
        // block tridiags
        pack_td_ptr(btdm_.pack_td_ptr),
        flat_td_ptr(btdm_.flat_td_ptr),
        A_colindsub(btdm_.A_colindsub),
        internal_vector_values((internal_vector_type*)btdm_.values.data(),
                               btdm_.values.extent(0),
                               btdm_.values.extent(1),
                               btdm_.values.extent(2),
                               vector_length/internal_vector_length),
        scalar_values((impl_scalar_type*)btdm_.values.data(),
                      btdm_.values.extent(0),
                      btdm_.values.extent(1),
                      btdm_.values.extent(2),
                      vector_length),
        blocksize(btdm_.values.extent(1)),
        blocksize_square(blocksize*blocksize),
        tiny(tiny_),
        vector_loop_size(vector_length/internal_vector_length),
        vector_length_value(vector_length) {}

    private:
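      ///
      /// serial extract: gathers the tridiag blocks of up to vector_length
      /// parts from A's values into the SIMD-packed values array; npacks
      /// shrinks as shorter parts run out of rows
      ///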
      KOKKOS_INLINE_FUNCTION
      void
      extract(local_ordinal_type partidx,
              local_ordinal_type npacks) const {
        const size_type kps = pack_td_ptr(partidx);
        local_ordinal_type kfs[vector_length] = {};
        local_ordinal_type ri0[vector_length] = {};
        local_ordinal_type nrows[vector_length] = {};

        for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
          kfs[vi] = flat_td_ptr(partidx);
          ri0[vi] = partptr(partidx);
          nrows[vi] = partptr(partidx+1) - ri0[vi];
        }
        for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
          for (local_ordinal_type e=0;e<3;++e) {
            const impl_scalar_type* block[vector_length] = {};
            for (local_ordinal_type vi=0;vi<npacks;++vi) {
              const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
              block[vi] = &A_values(Aj*blocksize_square);
            }
            const size_type pi = kps + j;
            ++j;
            for (local_ordinal_type ii=0;ii<blocksize;++ii) {
              for (local_ordinal_type jj=0;jj<blocksize;++jj) {
                const auto idx = ii*blocksize + jj;
                auto& v = internal_vector_values(pi, ii, jj, 0);
                for (local_ordinal_type vi=0;vi<npacks;++vi)
                  v[vi] = block[vi][idx];
              }
            }

            if (nrows[0] == 1) break;
            if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
            for (local_ordinal_type vi=1;vi<npacks;++vi) {
              if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
                npacks = vi;
                break;
              }
            }
          }
        }
      }
      KOKKOS_INLINE_FUNCTION
      void
      extract(const member_type &member,
              const local_ordinal_type &partidxbeg,
              const local_ordinal_type &npacks,
              const local_ordinal_type &vbeg) const {
        local_ordinal_type kfs_vals[internal_vector_length] = {};
        local_ordinal_type ri0_vals[internal_vector_length] = {};
        local_ordinal_type nrows_vals[internal_vector_length] = {};

        const size_type kps = pack_td_ptr(partidxbeg);
        for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
          kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
          ri0_vals[vi] = partptr(partidxbeg+vi);
          nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
        }

        local_ordinal_type j_vals[internal_vector_length] = {};
        for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {
          for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
            const local_ordinal_type nrows = nrows_vals[vi];
            if (tr < nrows) {
              auto &j = j_vals[vi];
              const local_ordinal_type kfs = kfs_vals[vi];
              const local_ordinal_type ri0 = ri0_vals[vi];
              const local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
              const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
              for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
                const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
                const impl_scalar_type* block = &A_values(Aj*blocksize_square);
                const size_type pi = kps + j;
                Kokkos::parallel_for
                  (Kokkos::TeamThreadRange(member,blocksize),
                   [&](const local_ordinal_type &ii) {
                    for (local_ordinal_type jj=0;jj<blocksize;++jj)
                      scalar_values(pi, ii, jj, v) = block[ii*blocksize + jj];
                  });
              }
            }
          }
        }
      }
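      ///
      /// block tridiagonal factorization: given the LU factors of the
      /// diagonal block d_{r-1}, each step updates
      ///   u_{r-1} <- L^{-1} u_{r-1},   l_{r-1} <- l_{r-1} U^{-1},
      ///   d_r <- LU(d_r - l_{r-1} u_{r-1}),
      /// realized below as two Trsm's, a Gemm update, and an LU per row;
      /// single-row parts are explicitly inverted instead (block Jacobi)
      ///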
      template<typename AAViewType,
               typename WWViewType>
      KOKKOS_INLINE_FUNCTION
      void
      factorize(const member_type &member,
                const local_ordinal_type &i0,
                const local_ordinal_type &nrows,
                const local_ordinal_type &v,
                const AAViewType &AA,
                const WWViewType &WW) const {
        namespace KB = KokkosBatched::Experimental;

        typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
          <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
        typedef default_mode_and_algo_type::mode_type default_mode_type;
        typedef default_mode_and_algo_type::algo_type default_algo_type;

        // constant
        const auto one = Kokkos::ArithTraits<magnitude_type>::one();

        // subview pattern
        auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
        KB::LU<member_type,
               default_mode_type,KB::Algo::LU::Unblocked>
          ::invoke(member, A , tiny);
        if (nrows > 1) {
          auto B = A;
          auto C = A;
          local_ordinal_type i = i0;
          for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
            B.assign_data( &AA(i+1,0,0,v) );
            KB::Trsm<member_type,
                     KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                     default_mode_type,default_algo_type>
              ::invoke(member, one, A, B);
            C.assign_data( &AA(i+2,0,0,v) );
            KB::Trsm<member_type,
                     KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                     default_mode_type,default_algo_type>
              ::invoke(member, one, A, C);
            A.assign_data( &AA(i+3,0,0,v) );
            KB::Gemm<member_type,
                     KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                     default_mode_type,default_algo_type>
              ::invoke(member, -one, C, B, one, A);
            KB::LU<member_type,
                   default_mode_type,KB::Algo::LU::Unblocked>
              ::invoke(member, A, tiny);
          }
        } else {
          // for block jacobi, invert the matrix in place here
          auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
          KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
            ::invoke(member, A, W);
          KB::SetIdentity<member_type,default_mode_type>
            ::invoke(member, A);
          member.team_barrier();
          KB::Trsm<member_type,
                   KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                   default_mode_type,default_algo_type>
            ::invoke(member, one, W, A);
          KB::Trsm<member_type,
                   KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                   default_mode_type,default_algo_type>
            ::invoke(member, one, W, A);
        }
      }
    public:
      struct ExtractAndFactorizeTag {};

      KOKKOS_INLINE_FUNCTION
      void
      operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
        // btdm is packed and sorted from the largest part
        const local_ordinal_type packidx = member.league_rank();

        const local_ordinal_type partidx = packptr(packidx);
        const local_ordinal_type npacks = packptr(packidx+1) - partidx;
        const local_ordinal_type i0 = pack_td_ptr(partidx);
        const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);

        internal_vector_scratch_type_3d_view
          WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
        if (vector_loop_size == 1) {
          extract(partidx, npacks);
          factorize(member, i0, nrows, 0, internal_vector_values, WW);
        } else {
          Kokkos::parallel_for
            (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const local_ordinal_type &v) {
              const local_ordinal_type vbeg = v*internal_vector_length;
              if (vbeg < npacks)
                extract(member, partidx+vbeg, npacks, vbeg);
              factorize(member, i0, nrows, v, internal_vector_values, WW);
            });
        }
      }

      void run() {
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::ExtractAndFactorize");
        const local_ordinal_type team_size =
          ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
          recommended_team_size(blocksize, vector_length, internal_vector_length);
        const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
          shmem_size(blocksize, blocksize, vector_loop_size);

        Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
          policy(packptr.extent(0)-1, team_size, vector_loop_size);
#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
        Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
                             policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
#else
        policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
        Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
                             policy, *this);
#endif
        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
      }
    };
    ///
    /// top level numeric interface
    ///
    template<typename MatrixType>
    void
    performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                        const PartInterface<MatrixType> &interf,
                        BlockTridiags<MatrixType> &btdm,
                        const typename ImplType<MatrixType>::magnitude_type tiny) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
      ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
      function.run();
    }
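    ///
    /// multivector converter: permutes a Tpetra multivector into the
    /// SIMD-packed 3d layout (packed row, point row, column) used by the
    /// tridiag solves
    ///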
    template<typename MatrixType>
    struct MultiVectorConverter {
    public:
      using impl_type = ImplType<MatrixType>;
      using execution_space = typename impl_type::execution_space;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;
      using magnitude_type = typename impl_type::magnitude_type;
      using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
      static constexpr int vector_length = impl_type::vector_length;

      using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

    private:
      // part interface
      const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
      const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
      const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
      const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
      const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
      const local_ordinal_type blocksize;
      const local_ordinal_type num_vectors;

      // packed and flat multivector views
      Unmanaged<vector_type_3d_view> packed_multivector;
      Unmanaged<impl_scalar_type_2d_view> scalar_multivector;

      template<typename TagType>
      KOKKOS_INLINE_FUNCTION
      void copy_multivectors(const local_ordinal_type &j,
                             const local_ordinal_type &vi,
                             const local_ordinal_type &pri,
                             const local_ordinal_type &ri0) const {
        for (local_ordinal_type col=0;col<num_vectors;++col)
          for (local_ordinal_type i=0;i<blocksize;++i)
            packed_multivector(pri, i, col)[vi] = scalar_multivector(blocksize*lclrow(ri0+j)+i,col);
      }

    public:
      MultiVectorConverter(const PartInterface<MatrixType> &interf,
                           const vector_type_3d_view &pmv)
        : partptr(interf.partptr),
          packptr(interf.packptr),
          part2packrowidx0(interf.part2packrowidx0),
          part2rowidx0(interf.part2rowidx0),
          lclrow(interf.lclrow),
          blocksize(pmv.extent(1)),
          num_vectors(pmv.extent(2)),
          packed_multivector(pmv) {}
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const local_ordinal_type &packidx) const {
        local_ordinal_type partidx = packptr(packidx);
        local_ordinal_type npacks = packptr(packidx+1) - partidx;
        const local_ordinal_type pri0 = part2packrowidx0(partidx);

        local_ordinal_type ri0[vector_length] = {};
        local_ordinal_type nrows[vector_length] = {};
        for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
          ri0[v] = part2rowidx0(partidx);
          nrows[v] = part2rowidx0(partidx+1) - ri0[v];
        }
        for (local_ordinal_type j=0;j<nrows[0];++j) {
          // shrink npacks as shorter parts run out of rows
          local_ordinal_type cnt = 1;
          for (;cnt<npacks && j!= nrows[cnt];++cnt);
          npacks = cnt;
          const local_ordinal_type pri = pri0 + j;
          for (local_ordinal_type col=0;col<num_vectors;++col)
            for (local_ordinal_type i=0;i<blocksize;++i)
              for (local_ordinal_type v=0;v<npacks;++v)
                packed_multivector(pri, i, col)[v] = scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col);
        }
      }
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const member_type &member) const {
        const local_ordinal_type packidx = member.league_rank();
        const local_ordinal_type partidx_begin = packptr(packidx);
        const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
        const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
          const local_ordinal_type partidx = partidx_begin + v;
          const local_ordinal_type ri0 = part2rowidx0(partidx);
          const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;

          if (nrows == 1) {
            const local_ordinal_type pri = pri0;
            for (local_ordinal_type col=0;col<num_vectors;++col) {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
                packed_multivector(pri, i, col)[v] = scalar_multivector(blocksize*lclrow(ri0)+i,col);
              });
            }
          } else {
            Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
              const local_ordinal_type pri = pri0 + j;
              for (local_ordinal_type col=0;col<num_vectors;++col)
                for (local_ordinal_type i=0;i<blocksize;++i)
                  packed_multivector(pri, i, col)[v] = scalar_multivector(blocksize*lclrow(ri0+j)+i,col);
            });
          }
        });
      }
  template<typename TpetraLocalViewType>
  void run(const TpetraLocalViewType &scalar_multivector_) {
    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::MultiVectorConverter");

    scalar_multivector = scalar_multivector_;
    if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
      const local_ordinal_type vl = vector_length;
      const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
      Kokkos::parallel_for("MultiVectorConverter::TeamPolicy", policy, *this);
#endif
    } else {
#if defined(__CUDA_ARCH__)
      TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "A host code path should not be compiled for CUDA");
#else
      const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
      Kokkos::parallel_for("MultiVectorConverter::RangePolicy", policy, *this);
#endif
    }
    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
  }
};
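// Usage sketch (illustrative; this mirrors how applyInverseJacobi below
// drives the converter):
//
//   MultiVectorConverter<MatrixType> converter(interf, pmv);
//   converter.run(X.template getLocalView<memory_space>());
//
// which fills pmv(pri, i, col)[lane] from the flat (row, col) layout of the
// Tpetra multivector's local view.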
///
/// solve tridiags
///
template<typename ArgActiveExecutionMemorySpace>
struct SolveTridiagsDefaultModeAndAlgo;

template<>
struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
  typedef KokkosBatched::Experimental::Mode::Serial mode_type;
  typedef KokkosBatched::Experimental::Algo::Level2::Unblocked single_vector_algo_type;
#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
  typedef KokkosBatched::Experimental::Algo::Level3::CompactMKL multi_vector_algo_type;
#else
  typedef KokkosBatched::Experimental::Algo::Level3::Blocked multi_vector_algo_type;
#endif
  static int recommended_team_size(const int /* blksize */,
                                   const int /* vector_length */,
                                   const int /* internal_vector_length */) {
    return 1;
  }
};

#if defined(KOKKOS_ENABLE_CUDA)
static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
                                                       const int vector_length,
                                                       const int internal_vector_length) {
  const int vector_size = vector_length/internal_vector_length;
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  64;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return total_team_size/vector_size;
}

template<>
struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
  typedef KokkosBatched::Experimental::Mode::Team mode_type;
  typedef KokkosBatched::Experimental::Algo::Level2::Unblocked single_vector_algo_type;
  typedef KokkosBatched::Experimental::Algo::Level3::Unblocked multi_vector_algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
  }
};

template<>
struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
  typedef KokkosBatched::Experimental::Mode::Team mode_type;
  typedef KokkosBatched::Experimental::Algo::Level2::Unblocked single_vector_algo_type;
  typedef KokkosBatched::Experimental::Algo::Level3::Unblocked multi_vector_algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
  }
};
#endif
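// Worked example (illustrative): with blksize = 9, vector_length = 8, and
// internal_vector_length = 4, the vector dimension of the team policy is
// 8/4 = 2, the table above picks total_team_size = 64, and the recommended
// team size is 64/2 = 32, so each team occupies 32*2 = 64 CUDA threads.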
template<typename MatrixType>
struct SolveTridiags {
public:
  using impl_type = ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  /// views
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  /// vectorization
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
  using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;

  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;

  using internal_vector_type = typename impl_type::internal_vector_type;
  static constexpr int vector_length = impl_type::vector_length;
  static constexpr int internal_vector_length = impl_type::internal_vector_length;

  /// flat multivector views
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;

  /// team policy types
  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
  using member_type = typename team_policy_type::member_type;

private:
  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;

  // block tridiags
  const ConstUnmanaged<size_type_1d_view> pack_td_ptr;

  // block tridiag factors and the packed solution
  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;

  const local_ordinal_type vector_loop_size;

  // copy-back to flat multivector
  Unmanaged<impl_scalar_type_2d_view> Y_scalar_multivector;
#if defined(__CUDA_ARCH__)
  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
#else
  Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
#endif
  const impl_scalar_type df;
  const bool compute_diff;

public:
  SolveTridiags(const PartInterface<MatrixType> &interf,
                const BlockTridiags<MatrixType> &btdm,
                const vector_type_3d_view &pmv,
                const impl_scalar_type damping_factor,
                const bool is_norm_manager_active)
    :
    // interface
    partptr(interf.partptr),
    packptr(interf.packptr),
    part2packrowidx0(interf.part2packrowidx0),
    lclrow(interf.lclrow),
    // block tridiags and multivector
    pack_td_ptr(btdm.pack_td_ptr),
    D_internal_vector_values((internal_vector_type*)btdm.values.data(),
                             btdm.values.extent(0),
                             btdm.values.extent(1),
                             btdm.values.extent(2),
                             vector_length/internal_vector_length),
    X_internal_vector_values((internal_vector_type*)pmv.data(),
                             pmv.extent(0),
                             pmv.extent(1),
                             pmv.extent(2),
                             vector_length/internal_vector_length),
    vector_loop_size(vector_length/internal_vector_length),
    Y_scalar_multivector(),
    Z_scalar_vector(),
    df(damping_factor),
    compute_diff(is_norm_manager_active)
  {}
  /// move the packed solution into the flat multivector Y with the damped
  /// update, optionally accumulating the squared update per pack
  KOKKOS_INLINE_FUNCTION
  void
  copyToFlatMultiVector(const member_type &member,
                        const local_ordinal_type partidxbeg, // partidx for v = 0
                        const local_ordinal_type npacks,
                        const local_ordinal_type pri0,
                        const local_ordinal_type v, // index in the vector loop
                        const local_ordinal_type blocksize,
                        const local_ordinal_type num_vectors) const {
    const local_ordinal_type vbeg = v*internal_vector_length;
    if (vbeg < npacks) {
      local_ordinal_type ri0_vals[internal_vector_length] = {};
      local_ordinal_type nrows_vals[internal_vector_length] = {};
      for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
        const local_ordinal_type partidx = partidxbeg+vv;
        ri0_vals[vi] = partptr(partidx);
        nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
      }

      impl_scalar_type z_partial_sum(0);
      if (nrows_vals[0] == 1) {
        const local_ordinal_type j=0, pri=pri0;
        for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
          const local_ordinal_type ri0 = ri0_vals[vi];
          const local_ordinal_type nrows = nrows_vals[vi];
          if (j < nrows) {
            Kokkos::parallel_for
              (Kokkos::TeamThreadRange(member, blocksize),
               [&](const local_ordinal_type &i) {
                const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
                for (local_ordinal_type col=0;col<num_vectors;++col) {
                  impl_scalar_type &y = Y_scalar_multivector(row,col);
                  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
                  y += df*yd;

                  if (compute_diff) {
                    const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
                    z_partial_sum += yd_abs*yd_abs;
                  }
                }
              });
          }
        }
      } else {
        Kokkos::parallel_for
          (Kokkos::TeamThreadRange(member, nrows_vals[0]),
           [&](const local_ordinal_type &j) {
            const local_ordinal_type pri = pri0 + j;
            for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
              const local_ordinal_type ri0 = ri0_vals[vi];
              const local_ordinal_type nrows = nrows_vals[vi];
              if (j < nrows) {
                for (local_ordinal_type col=0;col<num_vectors;++col) {
                  for (local_ordinal_type i=0;i<blocksize;++i) {
                    const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
                    impl_scalar_type &y = Y_scalar_multivector(row,col);
                    const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
                    y += df*yd;

                    if (compute_diff) {
                      const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
                      z_partial_sum += yd_abs*yd_abs;
                    }
                  }
                }
              }
            }
          });
      }

      // accumulate this thread's partial sum of squared updates
      if (compute_diff)
        Z_scalar_vector(member.league_rank()) += z_partial_sum;
    }
  }
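  // Note (illustrative, not in the original source): the update applied
  // above is the damped Jacobi step written per entry,
  //
  //   y_new = y_old + df * (x_packed - y_old),
  //
  // so df = 1 reduces to plain block Jacobi, and when compute_diff is set
  // the accumulated sum of |y_new - y_old|^2 feeds the norm-based stopping
  // test via Z_scalar_vector.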
  template<typename WWViewType>
  KOKKOS_INLINE_FUNCTION
  void
  solveSingleVector(const member_type &member,
                    const local_ordinal_type &blocksize,
                    const local_ordinal_type &i0,
                    const local_ordinal_type &r0,
                    const local_ordinal_type &nrows,
                    const local_ordinal_type &v,
                    const WWViewType &WW) const {
    namespace KB = KokkosBatched::Experimental;
    typedef SolveTridiagsDefaultModeAndAlgo
      <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
    typedef default_mode_and_algo_type::mode_type default_mode_type;
    typedef default_mode_and_algo_type::single_vector_algo_type default_algo_type;

    // base pointers into the packed factors and the packed right-hand side
    auto A = D_internal_vector_values.data();
    auto X = X_internal_vector_values.data();

    // constants
    const auto one = Kokkos::ArithTraits<magnitude_type>::one();
    const auto zero = Kokkos::ArithTraits<magnitude_type>::zero();

    // view strides; the solve walks the views through raw pointers
    const local_ordinal_type astep = D_internal_vector_values.stride_0();
    const local_ordinal_type as0 = D_internal_vector_values.stride_1();
    const local_ordinal_type as1 = D_internal_vector_values.stride_2();
    const local_ordinal_type xstep = X_internal_vector_values.stride_0();
    const local_ordinal_type xs0 = X_internal_vector_values.stride_1();

    // move to the starting point of this pack and SIMD lane
    A += i0*astep + v;
    X += r0*xstep + v;

    if (nrows > 1) {
      // solve Lx = x: forward substitution down the block rows
      KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member, KB::Diag::Unit,
         blocksize,blocksize,
         one, A, as0, as1, X, xs0);
      for (local_ordinal_type tr=1;tr<nrows;++tr) {
        KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
          (default_mode_type,default_algo_type,
           member,
           blocksize, blocksize,
           -one,
           A+2*astep, as0, as1,
           X, xs0,
           one, X+1*xstep, xs0);
        KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
          (default_mode_type,default_algo_type,
           member, KB::Diag::Unit,
           blocksize,blocksize,
           one,
           A+3*astep, as0, as1,
           X+1*xstep, xs0);
        A += 3*astep;
        X += 1*xstep;
      }

      // solve Ux = x: backward substitution up the block rows
      KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member, KB::Diag::NonUnit,
         blocksize, blocksize,
         one, A, as0, as1, X, xs0);
      for (local_ordinal_type tr=nrows;tr>1;--tr) {
        A -= 3*astep;
        KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
          (default_mode_type,default_algo_type,
           member,
           blocksize, blocksize,
           -one,
           A+1*astep, as0, as1,
           X, xs0,
           one, X-1*xstep, xs0);
        KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
          (default_mode_type,default_algo_type,
           member, KB::Diag::NonUnit,
           blocksize, blocksize,
           one, A, as0, as1, X-1*xstep, xs0);
        X -= 1*xstep;
      }
    } else {
      // a single-row part stores the explicit inverse of its block, so the
      // solve is a copy into scratch followed by one gemv
      const local_ordinal_type ws0 = WW.stride_0();
      auto W = WW.data() + v;
      KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,
         member, blocksize, X, xs0, W, ws0);
      KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member,
         blocksize, blocksize,
         one, A, as0, as1, W, ws0, zero, X, xs0);
    }
  }
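  // Sketch of the recursion above (illustrative): with the kth diagonal
  // block factored as L_k U_k and off-diagonal blocks B_k (lower) and C_k
  // (upper), the two sweeps compute, for k = 2..nrows and back,
  //
  //   forward:  x_k := L_k^{-1} (x_k - B_k x_{k-1})
  //   backward: x_k := U_k^{-1} (x_k - C_k x_{k+1})
  //
  // which is exactly block tridiagonal forward/backward substitution.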
  template<typename WWViewType>
  KOKKOS_INLINE_FUNCTION
  void
  solveMultiVector(const member_type &member,
                   const local_ordinal_type &/* blocksize */,
                   const local_ordinal_type &i0,
                   const local_ordinal_type &r0,
                   const local_ordinal_type &nrows,
                   const local_ordinal_type &v,
                   const WWViewType &WW) const {
    namespace KB = KokkosBatched::Experimental;
    typedef SolveTridiagsDefaultModeAndAlgo
      <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
    typedef default_mode_and_algo_type::mode_type default_mode_type;
    typedef default_mode_and_algo_type::multi_vector_algo_type default_algo_type;

    // constants
    const auto one = Kokkos::ArithTraits<magnitude_type>::one();
    const auto zero = Kokkos::ArithTraits<magnitude_type>::zero();

    // subview pattern: rebind the same subviews as the solve walks the rows
    auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
    auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
    auto X2 = X1;

    local_ordinal_type i = i0, r = r0;

    if (nrows > 1) {
      // solve Lx = x
      KB::Trsm<member_type,
               KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, X1);
      for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
        A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
        X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
        KB::Gemm<member_type,
                 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                 default_mode_type,default_algo_type>
          ::invoke(member, -one, A, X1, one, X2);
        A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
        KB::Trsm<member_type,
                 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
                 default_mode_type,default_algo_type>
          ::invoke(member, one, A, X2);
        X1.assign_data( X2.data() );
      }

      // solve Ux = x
      KB::Trsm<member_type,
               KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, X1);
      for (local_ordinal_type tr=nrows;tr>1;--tr) {
        i -= 3;
        A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
        X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
        KB::Gemm<member_type,
                 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
                 default_mode_type,default_algo_type>
          ::invoke(member, -one, A, X1, one, X2);
        A.assign_data( &D_internal_vector_values(i,0,0,v) );
        KB::Trsm<member_type,
                 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
                 default_mode_type,default_algo_type>
          ::invoke(member, one, A, X2);
        X1.assign_data( X2.data() );
      }
    } else {
      // single-row part: its block holds the explicit inverse, apply via gemm
      auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
      KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
        ::invoke(member, X1, W);
      KB::Gemm<member_type,
               KB::Trans::NoTranspose,KB::Trans::NoTranspose,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, W, zero, X1);
    }
  }
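  // Design note (illustrative): the single-vector path uses level-2 kernels
  // (trsv/gemv) on one right-hand side, while this path batches all columns
  // of the multivector through level-3 trsm/gemm, which amortizes reads of
  // the factors when num_vectors > 1. Both walk the same packed factors.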
  template<int B> struct SingleVectorTag {};
  template<int B> struct MultiVectorTag {};

  template<int B>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const SingleVectorTag<B> &, const member_type &member) const {
    const local_ordinal_type packidx = member.league_rank();
    const local_ordinal_type partidx = packptr(packidx);
    const local_ordinal_type npacks = packptr(packidx+1) - partidx;
    const local_ordinal_type pri0 = part2packrowidx0(partidx);
    const local_ordinal_type i0 = pack_td_ptr(partidx);
    const local_ordinal_type r0 = part2packrowidx0(partidx);
    const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
    const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
    const local_ordinal_type num_vectors = 1;
    internal_vector_scratch_type_3d_view
      WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
    Kokkos::single(Kokkos::PerTeam(member), [&]() {
      Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
    });
    Kokkos::parallel_for
      (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
        solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
        copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
      });
  }

  template<int B>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const MultiVectorTag<B> &, const member_type &member) const {
    const local_ordinal_type packidx = member.league_rank();
    const local_ordinal_type partidx = packptr(packidx);
    const local_ordinal_type npacks = packptr(packidx+1) - partidx;
    const local_ordinal_type pri0 = part2packrowidx0(partidx);
    const local_ordinal_type i0 = pack_td_ptr(partidx);
    const local_ordinal_type r0 = part2packrowidx0(partidx);
    const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
    const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
    const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);

    internal_vector_scratch_type_3d_view
      WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
    Kokkos::single(Kokkos::PerTeam(member), [&]() {
      Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
    });
    Kokkos::parallel_for
      (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
        solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
        copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
      });
  }
  void run(const impl_scalar_type_2d_view &Y,
           const impl_scalar_type_1d_view &Z) {
    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::SolveTridiags");

    /// set vectors
    this->Y_scalar_multivector = Y;
    this->Z_scalar_vector = Z;

    const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
    const local_ordinal_type blocksize = D_internal_vector_values.extent(1);

    const local_ordinal_type team_size =
      SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
      recommended_team_size(blocksize, vector_length, internal_vector_length);
    const int per_team_scratch = internal_vector_scratch_type_3d_view
      ::shmem_size(blocksize, num_vectors, vector_loop_size);

#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
    if (num_vectors == 1) {                                             \
      const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> >     \
        policy(packptr.extent(0) - 1, team_size, vector_loop_size);     \
      Kokkos::parallel_for                                              \
        ("SolveTridiags::TeamPolicy::run<SingleVector>",                \
         policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
    } else {                                                            \
      const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >      \
        policy(packptr.extent(0) - 1, team_size, vector_loop_size);     \
      Kokkos::parallel_for                                              \
        ("SolveTridiags::TeamPolicy::run<MultiVector>",                 \
         policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
    } break
#else
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
    if (num_vectors == 1) {                                             \
      Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> >           \
        policy(packptr.extent(0) - 1, team_size, vector_loop_size);     \
      policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));     \
      Kokkos::parallel_for                                              \
        ("SolveTridiags::TeamPolicy::run<SingleVector>",                \
         policy, *this);                                                \
    } else {                                                            \
      Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >            \
        policy(packptr.extent(0) - 1, team_size, vector_loop_size);     \
      policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));     \
      Kokkos::parallel_for                                              \
        ("SolveTridiags::TeamPolicy::run<MultiVector>",                 \
         policy, *this);                                                \
    } break
#endif
    switch (blocksize) {
    case   3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
    case   5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
    case   7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
    case   9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
    case  10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
    case  11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
    case  16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
    case  17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
    case  18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
    default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS

    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
  }
};
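// Note (illustrative): the switch above promotes the runtime blocksize into
// the compile-time template parameter B for the common sizes, letting the
// compiler fully unroll the blocksize loops inside the solve kernels; B = 0
// is the generic fallback that reads the blocksize from the view extent.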
///
/// compute residual
///
static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  64;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return total_team_size/team_size;
}
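// Worked example (illustrative): the residual kernels below use
// team_size = 8 on CUDA, so blksize = 7 gives total_team_size = 64 and a
// recommended vector size of 64/8 = 8 lanes per thread.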
template<typename MatrixType>
struct ComputeResidualVector {
public:
  using impl_type = ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  /// views
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type;
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
  static constexpr int vector_length = impl_type::vector_length;

  /// team policy member type
  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize
  enum : int { max_blocksize = 32 };

private:
  ConstUnmanaged<impl_scalar_type_2d_view> b;
  ConstUnmanaged<impl_scalar_type_2d_view> x; // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view> x_remote;
  Unmanaged<impl_scalar_type_2d_view> y;
  Unmanaged<vector_type_3d_view> y_packed;
  Unmanaged<impl_scalar_type_4d_view> y_packed_scalar;

  // AmD information
  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // block crs graph information
  // (the kokkos crs graph may use a different size_type from size_t)
  using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::device_type>::size_type;
  const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,typename impl_type::device_type> > A_rowptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> A_colind;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
  const bool is_dm2cm_active;

public:
  template<typename LocalCrsGraphType>
  ComputeResidualVector(const AmD<MatrixType> &amd,
                        const LocalCrsGraphType &graph,
                        const local_ordinal_type &blocksize_requested_,
                        const PartInterface<MatrixType> &interf,
                        const local_ordinal_type_1d_view &dm2cm_)
    : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
      colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
      tpetra_values(amd.tpetra_values),
      A_rowptr(graph.row_map),
      A_colind(graph.entries),
      blocksize_requested(blocksize_requested_),
      part2packrowidx0(interf.part2packrowidx0),
      part2rowidx0(interf.part2rowidx0),
      rowidx2part(interf.rowidx2part),
      partptr(interf.partptr),
      lclrow(interf.lclrow),
      dm2cm(dm2cm_),
      is_dm2cm_active(dm2cm_.span() > 0)
  {}
  KOKKOS_INLINE_FUNCTION
  void
  SerialGemv(const local_ordinal_type &blocksize,
             const impl_scalar_type * const KOKKOS_RESTRICT AA,
             const impl_scalar_type * const KOKKOS_RESTRICT xx,
             /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
    for (local_ordinal_type k0=0;k0<blocksize;++k0) {
      impl_scalar_type val = 0;
      const local_ordinal_type offset = k0*blocksize;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#   pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#   pragma unroll
#endif
      for (local_ordinal_type k1=0;k1<blocksize;++k1)
        val += AA[offset+k1]*xx[k1];
      yy[k0] -= val;
    }
  }
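  // Note (illustrative): all three gemv helpers here compute y := y - A*x on
  // one block, i.e. they subtract a block's contribution from the residual;
  // for a 2x2 block with A = [[2,0],[0,2]], x = (1,1), y = (5,5) the result
  // is y = (3,3). SerialGemv is the scalar fallback; the team and vector
  // variants spread the same update over team threads and vector lanes.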
  template<typename bbViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  VectorCopy(const member_type &member,
             const local_ordinal_type &blocksize,
             const bbViewType &bb,
             const yyViewType &yy) const {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
      yy(k0) = bb(k0);
    });
  }
  template<typename AAViewType, typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  TeamVectorGemv(const member_type &member,
                 const local_ordinal_type &blocksize,
                 const AAViewType &AA,
                 const xxViewType &xx,
                 const yyViewType &yy) const {
    Kokkos::parallel_for
      (Kokkos::TeamThreadRange(member, blocksize),
       [&](const local_ordinal_type &k0) {
        impl_scalar_type val = 0;
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, blocksize),
           [&](const local_ordinal_type &k1) {
            val += AA(k0,k1)*xx(k1);
          });
        Kokkos::atomic_fetch_add(&yy(k0), -val);
      });
  }

  template<typename AAViewType, typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  VectorGemv(const member_type &member,
             const local_ordinal_type &blocksize,
             const AAViewType &AA,
             const xxViewType &xx,
             const yyViewType &yy) const {
    Kokkos::parallel_for
      (Kokkos::ThreadVectorRange(member, blocksize),
       [&](const local_ordinal_type &k0) {
        impl_scalar_type val(0);
        for (local_ordinal_type k1=0;k1<blocksize;++k1) {
          val += AA(k0,k1)*xx(k1);
        }
        Kokkos::atomic_fetch_add(&yy(k0), -val);
      });
  }
  struct SeqTag {};

  KOKKOS_INLINE_FUNCTION
  void
  operator() (const SeqTag &, const local_ordinal_type& i) const {
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);
    const local_ordinal_type row = i*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      // y := b
      impl_scalar_type *yy = &y(row, col);
      const impl_scalar_type * const bb = &b(row, col);
      memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);

      // y -= Rx
      const size_type A_k0 = A_rowptr[i];
      for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
        const size_type j = A_k0 + colindsub[k];
        const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
        const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
        SerialGemv(blocksize,AA,xx,yy);
      }
    }
  }
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const SeqTag &, const member_type &member) const {
    namespace KB = KokkosBatched::Experimental;

    // constants
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    const local_ordinal_type lr = member.league_rank();
    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);

    // subview pattern
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = Kokkos::subview(x, block_range, 0);
    auto yy = Kokkos::subview(y, block_range, 0);
    auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      // y := b
      yy.assign_data(&y(row, col));
      bb.assign_data(&b(row, col));
      if (member.team_rank() == 0)
        VectorCopy(member, blocksize, bb, yy);
      member.team_barrier();

      // y -= Rx
      const size_type A_k0 = A_rowptr[lr];
      Kokkos::parallel_for
        (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
         [&](const local_ordinal_type &k) {
          const size_type j = A_k0 + colindsub[k];
          A_block.assign_data( &tpetra_values(j*blocksize_square) );
          xx.assign_data( &x(A_colind[j]*blocksize, col) );
          VectorGemv(member, blocksize, A_block, xx, yy);
        });
    }
  }
  template<int B> struct AsyncTag {};

  template<int B>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const local_ordinal_type num_vectors = y_packed.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // temporary buffer for the flat y
    impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      // y := b
      memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);

      // y -= Rx
      const size_type A_k0 = A_rowptr[lr];
      for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
        const size_type j = A_k0 + colindsub[k];
        const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
        const local_ordinal_type A_colind_at_j = A_colind[j];
        if (A_colind_at_j < num_local_rows) {
          const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
          const impl_scalar_type * const xx = &x(loc*blocksize, col);
          SerialGemv(blocksize, AA,xx,yy);
        } else {
          const auto loc = A_colind_at_j - num_local_rows;
          const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
          SerialGemv(blocksize, AA,xx_remote,yy);
        }
      }
      // move yy to y_packed
      for (local_ordinal_type k=0;k<blocksize;++k)
        y_packed(pri, k, col)[v] = yy[k];
    }
  }
  template<int B>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const AsyncTag<B> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = Kokkos::subview(x, block_range, 0);
    auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
    auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
    auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      // y := b
      bb.assign_data(&b(row, col));
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      if (member.team_rank() == 0)
        VectorCopy(member, blocksize, bb, yy);
      member.team_barrier();

      // y -= Rx
      const size_type A_k0 = A_rowptr[lr];
      Kokkos::parallel_for
        (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
         [&](const local_ordinal_type &k) {
          const size_type j = A_k0 + colindsub[k];
          A_block.assign_data( &tpetra_values(j*blocksize_square) );

          const local_ordinal_type A_colind_at_j = A_colind[j];
          if (A_colind_at_j < num_local_rows) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            xx.assign_data( &x(loc*blocksize, col) );
            VectorGemv(member, blocksize, A_block, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            xx_remote.assign_data( &x_remote(loc*blocksize, col) );
            VectorGemv(member, blocksize, A_block, xx_remote, yy);
          }
        });
    }
  }
  template <int P, int B> struct OverlapTag {};

  template<int P, int B>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const local_ordinal_type num_vectors = y_packed.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // temporary buffer for the flat y
    impl_scalar_type yy[max_blocksize] = {};

    auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
    auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      if (P == 0) {
        // y := b (owned pass)
        memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
      } else {
        // zero the temporary; the remote pass accumulates into y_packed
        memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
      }

      // y -= Rx
      const size_type A_k0 = A_rowptr[lr];
      for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
        const size_type j = A_k0 + colindsub_used[k];
        const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
        const local_ordinal_type A_colind_at_j = A_colind[j];
        if (A_colind_at_j < num_local_rows) {
          const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
          const impl_scalar_type * const xx = &x(loc*blocksize, col);
          SerialGemv(blocksize,AA,xx,yy);
        } else {
          const auto loc = A_colind_at_j - num_local_rows;
          const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
          SerialGemv(blocksize,AA,xx_remote,yy);
        }
      }
      // move yy to y_packed
      if (P == 0) {
        for (local_ordinal_type k=0;k<blocksize;++k)
          y_packed(pri, k, col)[v] = yy[k];
      } else {
        for (local_ordinal_type k=0;k<blocksize;++k)
          y_packed(pri, k, col)[v] += yy[k];
      }
    }
  }
  template<int P, int B>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const OverlapTag<P,B> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto xx_remote = bb;
    auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
    auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
    auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
    auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      if (P == 0) {
        // y := b (owned pass); the remote pass accumulates into y as is
        bb.assign_data(&b(row, col));
        if (member.team_rank() == 0)
          VectorCopy(member, blocksize, bb, yy);
        member.team_barrier();
      }

      // y -= Rx
      const size_type A_k0 = A_rowptr[lr];
      Kokkos::parallel_for
        (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
         [&](const local_ordinal_type &k) {
          const size_type j = A_k0 + colindsub_used[k];
          A_block.assign_data( &tpetra_values(j*blocksize_square) );

          const local_ordinal_type A_colind_at_j = A_colind[j];
          if (A_colind_at_j < num_local_rows) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            xx.assign_data( &x(loc*blocksize, col) );
            VectorGemv(member, blocksize, A_block, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            xx_remote.assign_data( &x_remote(loc*blocksize, col) );
            VectorGemv(member, blocksize, A_block, xx_remote, yy);
          }
        });
    }
  }
  // y = b - Rx; seq method
  template<typename MultiVectorLocalViewTypeY,
           typename MultiVectorLocalViewTypeB,
           typename MultiVectorLocalViewTypeX>
  void run(const MultiVectorLocalViewTypeY &y_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_) {
    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::ComputeResidual::<SeqTag>");

    y = y_; b = b_; x = x_;
    if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
      const local_ordinal_type blocksize = blocksize_requested;
      const local_ordinal_type team_size = 8;
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
      const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
#endif
    } else {
#if defined(__CUDA_ARCH__)
      TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "A host code path should not be compiled for CUDA");
#else
      const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
#endif
    }
    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
  }
  // y = b - R (x, x_remote)
  template<typename MultiVectorLocalViewTypeB,
           typename MultiVectorLocalViewTypeX,
           typename MultiVectorLocalViewTypeX_Remote>
  void run(const vector_type_3d_view &y_packed_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_,
           const MultiVectorLocalViewTypeX_Remote &x_remote_) {
    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::ComputeResidual::<AsyncTag>");

    b = b_; x = x_; x_remote = x_remote_;
    if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
      y_packed_scalar = impl_scalar_type_4d_view((impl_scalar_type*)y_packed_.data(),
                                                 y_packed_.extent(0),
                                                 y_packed_.extent(1),
                                                 y_packed_.extent(2),
                                                 vector_length);
#endif
    } else {
      y_packed = y_packed_;
    }

    if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
      const local_ordinal_type blocksize = blocksize_requested;
      const local_ordinal_type team_size = 8;
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
      const Kokkos::TeamPolicy<execution_space,AsyncTag<B> >            \
        policy(rowidx2part.extent(0), team_size, vector_size);          \
      Kokkos::parallel_for                                              \
        ("ComputeResidual::TeamPolicy::run<AsyncTag>",                  \
         policy, *this); } break
      switch (blocksize_requested) {
      case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
    } else {
#if defined(__CUDA_ARCH__)
      TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "A host code path should not be compiled for CUDA");
#else
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
      const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
      Kokkos::parallel_for                                              \
        ("ComputeResidual::RangePolicy::run<AsyncTag>",                 \
         policy, *this); } break
      switch (blocksize_requested) {
      case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
    }
    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
  }
  // y = b - R (x, x_remote), done in two passes so communication can overlap
  // computation; P = 0 computes the owned part, P = 1 adds the remote part
  template<typename MultiVectorLocalViewTypeB,
           typename MultiVectorLocalViewTypeX,
           typename MultiVectorLocalViewTypeX_Remote>
  void run(const vector_type_3d_view &y_packed_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_,
           const MultiVectorLocalViewTypeX_Remote &x_remote_,
           const bool compute_owned) {
    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::ComputeResidual::<OverlapTag>");

    b = b_; x = x_; x_remote = x_remote_;
    if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
      y_packed_scalar = impl_scalar_type_4d_view((impl_scalar_type*)y_packed_.data(),
                                                 y_packed_.extent(0),
                                                 y_packed_.extent(1),
                                                 y_packed_.extent(2),
                                                 vector_length);
#endif
    } else {
      y_packed = y_packed_;
    }

    if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
      const local_ordinal_type blocksize = blocksize_requested;
      const local_ordinal_type team_size = 8;
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
      if (compute_owned) {                                              \
        const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> >      \
          policy(rowidx2part.extent(0), team_size, vector_size);        \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
      } else {                                                          \
        const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> >      \
          policy(rowidx2part.extent(0), team_size, vector_size);        \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
      } break
      switch (blocksize_requested) {
      case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
    } else {
#if defined(__CUDA_ARCH__)
      TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "A host code path should not be compiled for CUDA");
#else
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
      if (compute_owned) {                                              \
        const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> >     \
          policy(0, rowidx2part.extent(0));                             \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
      } else {                                                          \
        const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> >     \
          policy(0, rowidx2part.extent(0));                             \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
      } break
      switch (blocksize_requested) {
      case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
    }
    IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
  }
};
///
/// reduce the per-pack squared update norms into a single value per column
///
template<typename MatrixType>
void
reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
             const typename ImplType<MatrixType>::local_ordinal_type ncols,
             /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::ReduceVector");

  using impl_type = ImplType<MatrixType>;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
#if 0
  const auto norm2 = KokkosBlas::nrm1(zz);
#else
  impl_scalar_type norm2(0);
  Kokkos::parallel_reduce
    ("ReduceMultiVector::Device",
     Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
     KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
      update += zz(i);
    }, norm2);
#endif
  const auto norm = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2)/magnitude_type(ncols);
  for (local_ordinal_type j=0;j<ncols;++j) vals[j] = norm;

  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
}
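// Note (illustrative): zz holds one partial sum of squared updates per pack
// (each team writes Z_scalar_vector(league_rank) in SolveTridiags), so the
// reduction above yields the total of |dy|^2 over all local rows and
// columns; dividing by ncols spreads the same aggregate value into every
// column's slot of the norm buffer.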
///
/// norm manager: tracks the residual-norm-based termination test
///
template<typename MatrixType>
struct NormManager {
public:
  using impl_type = ImplType<MatrixType>;
  using magnitude_type = typename impl_type::magnitude_type;

private:
  bool collective_;
  int sweep_step_, num_vectors_;
#ifdef HAVE_IFPACK2_MPI
  MPI_Request mpi_request_;
  MPI_Comm comm_;
#endif
  magnitude_type work_[3];

public:
  NormManager() = default;
  NormManager(const NormManager &b) = default;
  NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
    sweep_step_ = 1;
    collective_ = comm->getSize() > 1;
    if (collective_) {
#ifdef HAVE_IFPACK2_MPI
      const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
      TEUCHOS_ASSERT( ! mpi_comm.is_null());
      comm_ = *mpi_comm->getRawMpiComm();
#endif
    }
  }

  // Check the norm every sweep_step sweeps.
  void setCheckFrequency(const int sweep_step) {
    sweep_step_ = sweep_step;
  }

  // Get the buffer into which to store rank-local squared norms.
  magnitude_type* getBuffer() { return &work_[0]; }

  // Call MPI_Iallreduce to find the global squared norms.
  void ireduce(const int sweep, const bool force = false) {
    if ( ! force && sweep % sweep_step_) return;

    IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");

    auto send_data = &work_[0] + 1;
    auto recv_data = &work_[0];
    if (collective_) {
      work_[1] = work_[0];
#ifdef HAVE_IFPACK2_MPI
# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
      MPI_Iallreduce(send_data, recv_data, 1,
                     Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
                     MPI_SUM, comm_, &mpi_request_);
# else
      MPI_Allreduce (send_data, recv_data, 1,
                     Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
                     MPI_SUM, comm_);
# endif
#endif
    }
  }

  // Check if the norm-based termination criterion is met. tol2 is the
  // tolerance squared. Sweep is the sweep index. If not every iteration is
  // being checked, this function immediately returns false. If a check must
  // be done at this iteration, it waits for the reduction triggered by
  // ireduce to complete, then checks the global norm against the tolerance.
  bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
    bool r_val = false;
    if (sweep <= 0) return false;

    IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");

    TEUCHOS_ASSERT(sweep >= 1);
    if ( ! force && (sweep - 1) % sweep_step_) return false;
    if (collective_) {
#ifdef HAVE_IFPACK2_MPI
# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
      MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
# endif
#endif
    }
    if (sweep == 1) {
      // the first computed norm becomes the reference norm
      work_[2] = work_[0];
    } else {
      r_val = (work_[0] < tol2*work_[2]);
    }
    return r_val;
  }

  // After termination has occurred, finalize the norms for use in
  // getNorms{0,final}.
  void finalize () {
    work_[0] = std::sqrt(work_[0]); // after converged
    work_[2] = std::sqrt(work_[2]); // first norm
  }

  // Report norms to the caller.
  const magnitude_type* getNorms0 () const { return &work_[2]; }
  const magnitude_type* getNormsFinal () const { return &work_[0]; }
};
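// Note (illustrative): the stopping test compares squared quantities so no
// square roots are taken inside the sweep loop; with tol as passed to
// applyInverseJacobi, iteration stops once
//
//   ||dy_sweep||^2 < tol^2 * ||dy_1||^2,
//
// and finalize() takes the square roots only once, for reporting.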
///
/// top level apply interface
///
template<typename MatrixType>
int
applyInverseJacobi(// importer
                   const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                   const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
                   const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                   const bool overlap_communication_and_computation,
                   // tpetra interface
                   const typename ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
                   /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
                   /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
                   /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
                   // local object interface
                   const PartInterface<MatrixType> &interf, // mesh interface
                   const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
                   const AmD<MatrixType> &amd, // R = A - D
                   /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
                   /* */ NormManager<MatrixType> &norm_manager,
                   // preconditioner parameters
                   const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
                   /* */ bool is_y_zero,
                   const int max_num_sweeps,
                   const typename ImplType<MatrixType>::magnitude_type tol,
                   const int check_tol_every) {
  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");

  using impl_type = ImplType<MatrixType>;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;

  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;

  // either the tpetra importer or the async importer must be active
  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
                              "Neither Tpetra importer nor Async importer is null.");
  // the max number of sweeps must be positive
  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
                              "Maximum number of sweeps must be >= 1.");

  // const parameters
  const bool is_seq_method_requested = !tpetra_importer.is_null();
  const bool is_async_importer_active = !async_importer.is_null();
  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
  const magnitude_type tolerance = tol*tol;
  const local_ordinal_type blocksize = btdm.values.extent(1);
  const local_ordinal_type num_vectors = Y.getNumVectors();
  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;

  const impl_scalar_type zero(0.0);

  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
                              "The seq method for applyInverseJacobi, " <<
                              "which in any case is for developer use only, " <<
                              "does not support norm-based termination.");

  // if more workspace is needed, resize it
  const size_type work_span_required = num_blockrows*num_vectors*blocksize;
  if (work.span() < work_span_required)
    work = vector_type_1d_view("vector workspace 1d view", work_span_required);

  // construct W
  const local_ordinal_type W_size = interf.packptr.extent(0)-1;
  if (local_ordinal_type(W.extent(0)) < W_size)
    W = impl_scalar_type_1d_view("W", W_size);

  typename AsyncableImport<MatrixType>::impl_scalar_type_2d_view remote_multivector;
  if (is_seq_method_requested) {
    if (Z.getNumVectors() != Y.getNumVectors())
      Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
  } else {
    if (is_async_importer_active) {
      // create comm data buffer and keep it here
      async_importer->createDataBuffer(num_vectors);
      remote_multivector = async_importer->getRemoteMultiVectorLocalView();
    }
  }

  // wrap the workspace with a 3d view
  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
  const auto XX = X.template getLocalView<memory_space>();
  const auto YY = Y.template getLocalView<memory_space>();
  const auto ZZ = Z.template getLocalView<memory_space>();

  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
                                           damping_factor, is_norm_manager_active);

  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
  ComputeResidualVector<MatrixType>
    compute_residual_vector(amd, A->getCrsGraph().getLocalGraph(), blocksize, interf,
                            is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);

  // norm manager check frequency
  if (is_norm_manager_active)
    norm_manager.setCheckFrequency(check_tol_every);

  // iterate
  int sweep = 0;
  for (;sweep<max_num_sweeps;++sweep) {
    if (is_y_zero) {
      // pmv := x(lclrow)
      multivector_converter.run(XX);
      Kokkos::deep_copy(YY, zero);
    } else {
      if (is_seq_method_requested) {
        // y := x - R y
        Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
        compute_residual_vector.run(YY, XX, ZZ);

        // pmv := y(lclrow).
        multivector_converter.run(YY);
      } else {
        // fused y := x - R y and pmv := y(lclrow);
        // real use case does not use overlap comp and comm
        if (overlap_communication_and_computation || !is_async_importer_active) {
          if (is_async_importer_active) async_importer->asyncSendRecv(YY);
          compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
          if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
            if (is_async_importer_active) async_importer->cancel();
            break;
          }
          if (is_async_importer_active) {
            async_importer->syncRecv();
            compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
          }
        } else {
          if (is_async_importer_active)
            async_importer->syncExchange(YY);
          if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
          compute_residual_vector.run(pmv, XX, YY, remote_multivector);
        }
      }
    }

    // pmv := inv(D) pmv.
    {
      solve_tridiags.run(YY, W);
    }

    // check the norm
    {
      if (is_norm_manager_active) {
        // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
        reduceVector<MatrixType>(W, num_vectors, norm_manager.getBuffer());
        if (sweep + 1 == max_num_sweeps) {
          norm_manager.ireduce(sweep, true);
          norm_manager.checkDone(sweep + 1, tolerance, true);
        } else {
          norm_manager.ireduce(sweep);
        }
      }
    }

    is_y_zero = false;
  }

  //sqrt the norms for the caller's use.
  if (is_norm_manager_active) norm_manager.finalize();

  return sweep;
}
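// Summary sketch (illustrative, not in the original source): one sweep of
// the preconditioner computes, with D the packed block tridiagonals and
// R = A - D,
//
//   r := x - R*y                (ComputeResidualVector, fused with repacking)
//   y := y + df*(D^{-1} r - y)  (SolveTridiags + copyToFlatMultiVector)
//
// i.e. damped block Jacobi over the partitioned lines, with an optional
// norm-based early exit every check_tol_every sweeps.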
///
/// Implementation object: holds the distributed objects and the locally
/// computed objects used across setup and apply
///
template<typename MatrixType>
struct ImplObject {
  using impl_type = ImplType<MatrixType>;
  using part_interface_type = PartInterface<MatrixType>;
  using block_tridiags_type = BlockTridiags<MatrixType>;
  using amd_type = AmD<MatrixType>;
  using norm_manager_type = NormManager<MatrixType>;
  using async_import_type = AsyncableImport<MatrixType>;

  // distributed objects
  Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
  Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
  Teuchos::RCP<async_import_type> async_importer;
  bool overlap_communication_and_computation;

  // copy of Y (mutable to penetrate const)
  mutable typename impl_type::tpetra_multivector_type Z;
  mutable typename impl_type::impl_scalar_type_1d_view W;

  // local objects
  part_interface_type part_interface;
  block_tridiags_type block_tridiags; // D
  amd_type a_minus_d; // R = A - D
  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
  mutable norm_manager_type norm_manager;
};

} // namespace BlockTriDiContainerDetails

} // namespace Ifpack2

#endif