#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP

#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#include <Kokkos_ArithTraits.hpp>
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_SetIdentity_Decl.hpp>
#include <KokkosBatched_SetIdentity_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Serial_Impl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif

// use asynchronous send/recv on top of MPI 3
#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3

// allocate MPI data buffers in pinned host memory when running on CUDA
#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI

// prefer CudaSpace over CudaUVMSpace for internal data
#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE

#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
#endif
namespace BlockTriDiContainerDetails {

  using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;

#if defined(__KOKKOSBATCHED_PROMOTION__)
  namespace KB = KokkosBatched;
#else
  namespace KB = KokkosBatched::Experimental;
#endif
  ///
  /// view decorators for unmanaged and const memory
  ///
  template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
  using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
                                            MemoryTraitsType::is_random_access |
                                            flag>;

  template <typename ViewType>
  using Unmanaged = Kokkos::View<typename ViewType::data_type,
                                 typename ViewType::array_layout,
                                 typename ViewType::device_type,
                                 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
  template <typename ViewType>
  using Atomic = Kokkos::View<typename ViewType::data_type,
                              typename ViewType::array_layout,
                              typename ViewType::device_type,
                              MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
  template <typename ViewType>
  using Const = Kokkos::View<typename ViewType::const_data_type,
                             typename ViewType::array_layout,
                             typename ViewType::device_type,
                             typename ViewType::memory_traits>;
  template <typename ViewType>
  using ConstUnmanaged = Const<Unmanaged<ViewType> >;

  template <typename ViewType>
  using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
  template <typename ViewType>
  using Scratch = Kokkos::View<typename ViewType::data_type,
                               typename ViewType::array_layout,
                               typename ViewType::execution_space::scratch_memory_space,
                               MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
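  // Usage sketch (illustrative only): these aliases change memory traits but
  // not the data layout, so they can alias an existing allocation, e.g.
  //   using view_type = Kokkos::View<double*>;
  //   view_type a("a", n);
  //   Unmanaged<view_type> a_u(a.data(), a.extent(0));  // non-owning alias
  //   ConstUnmanaged<view_type> a_c = a_u;               // read-only, non-owning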
  /// block tridiag factors can optionally be stored in a smaller scalar type
  template<typename T> struct BlockTridiagScalarType { typedef T type; };
#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
  template<> struct BlockTridiagScalarType<double> { typedef float type; };
#endif
  template<typename T> struct is_cuda { enum : bool { value = false }; };
#if defined(KOKKOS_ENABLE_CUDA)
  template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
#endif
  template<typename CommPtrType>
  std::string get_msg_prefix (const CommPtrType &comm) {
    const auto rank = comm->getRank();
    const auto nranks = comm->getSize();
    std::stringstream ss;
    ss << "Rank " << rank << " of " << nranks << ": ";
    return ss.str();
  }
  ///
  /// fixed-size array value type for custom reductions
  ///
  template<typename T, int N>
  struct ArrayValueType {
    T v[N];
    KOKKOS_INLINE_FUNCTION
    ArrayValueType() {
      for (int i=0;i<N;++i)
        this->v[i] = 0;
    }
    KOKKOS_INLINE_FUNCTION
    ArrayValueType(const ArrayValueType &b) {
      for (int i=0;i<N;++i)
        this->v[i] = b.v[i];
    }
  };
  template<typename T, int N>
  static KOKKOS_INLINE_FUNCTION void
  operator+=(volatile ArrayValueType<T,N> &a,
             volatile const ArrayValueType<T,N> &b) {
    for (int i=0;i<N;++i)
      a.v[i] += b.v[i];
  }
  template<typename T, int N>
  static KOKKOS_INLINE_FUNCTION void
  operator+=(ArrayValueType<T,N> &a,
             const ArrayValueType<T,N> &b) {
    for (int i=0;i<N;++i)
      a.v[i] += b.v[i];
  }
  ///
  /// custom reducer for a compile-time array of sums
  ///
  template<typename T, int N, typename ExecSpace>
  struct SumReducer {
    typedef SumReducer reducer;
    typedef ArrayValueType<T,N> value_type;
    typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
    value_type *value;
    KOKKOS_INLINE_FUNCTION
    SumReducer(value_type &val) : value(&val) {}
    KOKKOS_INLINE_FUNCTION
    void join(value_type &dst, value_type &src) const {
      for (int i=0;i<N;++i)
        dst.v[i] += src.v[i];
    }
    KOKKOS_INLINE_FUNCTION
    void join(volatile value_type &dst, const volatile value_type &src) const {
      for (int i=0;i<N;++i)
        dst.v[i] += src.v[i];
    }
    KOKKOS_INLINE_FUNCTION
    void init(value_type &val) const {
      for (int i=0;i<N;++i)
        val.v[i] = Kokkos::reduction_identity<T>::sum();
    }
    KOKKOS_INLINE_FUNCTION
    value_type& reference() { return *value; }
    KOKKOS_INLINE_FUNCTION
    result_view_type view() const { return result_view_type(value); }
  };
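  // Usage sketch (illustrative): SumReducer accumulates a fixed-size array of
  // sums in a single pass; e.g., with exec = Kokkos::DefaultHostExecutionSpace,
  //   SumReducer<size_t,3,exec>::value_type r;
  //   Kokkos::parallel_reduce(Kokkos::RangePolicy<exec>(0,n),
  //     KOKKOS_LAMBDA(const int i, ArrayValueType<size_t,3> &update) {
  //       ++update.v[i%3];
  //     }, SumReducer<size_t,3,exec>(r));
  // Afterwards r.v[0..2] hold the three counts (cf. the symbolic phase below,
  // which counts D, R-owned, and R-remote block entries this way).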
#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
#else
#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
#endif

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
  CUDA_SAFE_CALL(cudaProfilerStart());
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
  { CUDA_SAFE_CALL( cudaProfilerStop() ); }
#else
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
#endif
  ///
  /// implementation typedefs
  ///
  template <typename MatrixType>
  struct ImplType {
    ///
    /// matrix type derived types
    ///
    typedef size_t size_type;
    typedef typename MatrixType::scalar_type scalar_type;
    typedef typename MatrixType::local_ordinal_type local_ordinal_type;
    typedef typename MatrixType::global_ordinal_type global_ordinal_type;
    typedef typename MatrixType::node_type node_type;

    ///
    /// kokkos arithmetic traits of scalar_type
    ///
    typedef typename Kokkos::ArithTraits<scalar_type>::val_type impl_scalar_type;
    typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;

    typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
    typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;

    ///
    /// default host execution space
    ///
    typedef Kokkos::DefaultHostExecutionSpace host_execution_space;

    ///
    /// tpetra types
    ///
    typedef typename node_type::device_type node_device_type;
    typedef typename node_device_type::execution_space node_execution_space;
    typedef typename node_device_type::memory_space node_memory_space;

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE)
    /// force CudaSpace instead of CudaUVMSpace
    typedef node_execution_space execution_space;
    typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
                                      Kokkos::CudaSpace,
                                      node_memory_space>::type memory_space;
    typedef Kokkos::Device<execution_space,memory_space> device_type;
#else
    typedef node_execution_space execution_space;
    typedef node_memory_space memory_space;
    typedef node_device_type device_type;
#endif

    typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
    typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
    typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
    typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
    typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
    typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
    typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
    typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_type local_crs_graph_type;

    ///
    /// simd vectorization
    ///
    template<typename T, int l> using Vector = KB::Vector<T,l>;
    template<typename T> using SIMD = KB::SIMD<T>;
    template<typename T, typename M> using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
    template<typename T, typename M> using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
    static constexpr int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
    static constexpr int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
    typedef Vector<SIMD<btdm_scalar_type>,vector_length> vector_type;
    typedef Vector<SIMD<btdm_scalar_type>,internal_vector_length> internal_vector_type;
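    // vector_length is the SIMD pack width used to interleave independent
    // tridiagonal systems; internal_vector_length is the width exposed to the
    // batched kernels. The actual values depend on the architecture and scalar
    // type (illustrative only): an AVX-capable CPU with double scalars might
    // use vector_length = internal_vector_length = 8, while a CUDA build may
    // keep a wide pack but collapse internal_vector_length so that each thread
    // of a team owns one lane of the pack.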
    ///
    /// view types
    ///
    typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
    typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
    // 1d views
    typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
    typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
    // 2d views
    typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
    typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
    // packed block views
    typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
    typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
    typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
    typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
    typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
    typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
  };
  ///
  /// setup sequential importer
  ///
  template<typename MatrixType>
  Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
  createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
    IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
    using impl_type = ImplType<MatrixType>;
    using tpetra_map_type = typename impl_type::tpetra_map_type;
    using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
    using tpetra_import_type = typename impl_type::tpetra_import_type;

    const auto g = A->getCrsGraph();  // tpetra crs graph object
    const auto blocksize = A->getBlockSize();
    const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
    const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));

    return Teuchos::rcp(new tpetra_import_type(src, tgt));
  }
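  // The point map expands each block row of the graph map into `blocksize`
  // point rows: e.g., a domain map with 100 block GIDs and blocksize 5 yields
  // a point map with 500 point GIDs, which is what the importer above moves.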
  ///
  /// asyncable import: Tpetra::Import-like data exchange implemented with
  /// raw asynchronous MPI calls, so communication can overlap computation
  ///
  template<typename MatrixType>
  struct AsyncableImport {
  public:
    using impl_type = ImplType<MatrixType>;

  private:
    ///
    /// MPI wrapper
    ///
#if !defined(HAVE_IFPACK2_MPI)
    typedef int MPI_Request;
    typedef int MPI_Comm;
#endif
    using scalar_type = typename impl_type::scalar_type;
    static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
      MPI_Request ureq;
      int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
      if (ireq == NULL) MPI_Request_free(&ureq);
      return ret;
#else
      return 0;
#endif
    }
    static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
      MPI_Request ureq;
      int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
      if (ireq == NULL) MPI_Request_free(&ureq);
      return ret;
#else
      return 0;
#endif
    }
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
    // argument pack for the cuda stream callbacks (members inferred from the
    // callback bodies below)
    struct CallbackDataArgsForSendRecv {
      MPI_Comm comm;
      char *buf;
      int count;
      int pid;
      int tag;
      MPI_Request *ireq;
      int r_val;
    };

    static void CUDART_CB callback_isend(cudaStream_t this_stream, cudaError_t status, void *data) {
      CallbackDataArgsForSendRecv *in = reinterpret_cast<CallbackDataArgsForSendRecv*>(data);
      in->r_val = isend(in->comm, in->buf, in->count, in->pid, in->tag, in->ireq);
    }

    static void CUDART_CB callback_irecv(cudaStream_t this_stream, cudaError_t status, void *data) {
      CallbackDataArgsForSendRecv *in = reinterpret_cast<CallbackDataArgsForSendRecv*>(data);
      in->r_val = irecv(in->comm, in->buf, in->count, in->pid, in->tag, in->ireq);
    }

    static void CUDART_CB callback_wait(cudaStream_t this_stream, cudaError_t status, void *data) {
#ifdef HAVE_IFPACK2_MPI
      MPI_Request *req = reinterpret_cast<MPI_Request*>(data);
      MPI_Wait(req, MPI_STATUS_IGNORE);
#endif // HAVE_IFPACK2_MPI
    }
#endif // defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
    static int waitany(int count, MPI_Request* reqs, int* index) {
#ifdef HAVE_IFPACK2_MPI
      return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
      return 0;
#endif
    }

    static int waitall(int count, MPI_Request* reqs) {
#ifdef HAVE_IFPACK2_MPI
      return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
#else
      return 0;
#endif
    }
  public:
    using tpetra_map_type = typename impl_type::tpetra_map_type;
    using tpetra_import_type = typename impl_type::tpetra_import_type;

    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using global_ordinal_type = typename impl_type::global_ordinal_type;
    using size_type = typename impl_type::size_type;
    using impl_scalar_type = typename impl_type::impl_scalar_type;

    using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
    using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;

    using execution_space = typename impl_type::execution_space;
    using memory_space = typename impl_type::memory_space;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using size_type_1d_view = typename impl_type::size_type_1d_view;
    using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;

#if defined(KOKKOS_ENABLE_CUDA)
    using impl_scalar_type_1d_view =
      typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
                                Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
# elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
                                Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
# else // no experimental macros are defined
                                typename impl_type::impl_scalar_type_1d_view,
# endif
                                typename impl_type::impl_scalar_type_1d_view>::type;
#else
    using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
#endif
    using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
    using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

#ifdef HAVE_IFPACK2_MPI
    MPI_Comm comm;
#endif
    impl_scalar_type_2d_view_tpetra remote_multivector;
    local_ordinal_type blocksize;
    template<typename T>
    struct SendRecvPair {
      T send, recv;
    };

    SendRecvPair<int_1d_view_host> pids;              // mpi ranks
    SendRecvPair<std::vector<MPI_Request> > reqs;     // MPI_Request is pointer; cannot use kokkos view
    SendRecvPair<size_type_1d_view> offset;           // offsets into lid lists and data buffers
    SendRecvPair<size_type_1d_view_host> offset_host; // host mirrors of the offsets
    SendRecvPair<local_ordinal_type_1d_view> lids;    // local id lists
    SendRecvPair<impl_scalar_type_1d_view> buffer;    // data buffers

    local_ordinal_type_1d_view dm2cm;                 // permutation, domain map to column map

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
    using cuda_stream_1d_std_vector = std::vector<cudaStream_t>;
    SendRecvPair<cuda_stream_1d_std_vector> stream;

    using callback_data_send_recv_1d_std_vector = std::vector<CallbackDataArgsForSendRecv>;
    SendRecvPair<callback_data_send_recv_1d_std_vector> callback_data;
#endif
    void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                         const size_type_1d_view &offs) {
      // wrap lens in an unmanaged host view and copy to device
      Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
      const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);

      // exclusive scan
      const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
      const local_ordinal_type lens_size = lens_device.extent(0);
      Kokkos::parallel_scan
        ("AsyncableImport::RangePolicy::setOffsetValues",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
          if (final)
            offs(i) = update;
          update += (i < lens_size ? lens_device[i] : 0);
        });
    }
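    // Example (illustrative): for per-neighbor message lengths lens = {3,1,4}
    // and offs of extent 4, the exclusive scan produces offs = {0,3,4,8};
    // offs(i) and offs(i+1) then bracket neighbor i's entries in the lid
    // lists and data buffers.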
    void createMpiRequests(const tpetra_import_type &import) {
      Tpetra::Distributor &distributor = import.getDistributor();

      // copy pids from the distributor
      const auto pids_from = distributor.getProcsFrom();
      pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
      memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));

      const auto pids_to = distributor.getProcsTo();
      pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
      memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));

      // mpi requests
      reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
      reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));

      // construct offsets
      const auto lengths_to = distributor.getLengthsTo();
      offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

      const auto lengths_from = distributor.getLengthsFrom();
      offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

      setOffsetValues(lengths_to, offset.send);
      offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);

      setOffsetValues(lengths_from, offset.recv);
      offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
    }
    void createSendRecvIDs(const tpetra_import_type &import) {
      // recv ids
      const auto remote_lids = import.getRemoteLIDs();
      const local_ordinal_type_1d_view_host
        remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
      lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
      Kokkos::deep_copy(lids.recv, remote_lids_view_host);

      // send ids
      auto epids = import.getExportPIDs();
      auto elids = import.getExportLIDs();
      TEUCHOS_ASSERT(epids.size() == elids.size());
      lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
      auto lids_send_host = Kokkos::create_mirror_view(lids.send);

      // naive search (the number of sends is small)
      for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
        const auto pid_send_value = pids.send[i];
        for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
          if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
#if !defined(__CUDA_ARCH__)
        TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
#endif
      }
      Kokkos::deep_copy(lids.send, lids_send_host);
    }
  public:
    // tags for the pack/unpack direction of copy kernels
    struct ToBuffer {};
    struct ToMultiVector {};

    AsyncableImport(const Teuchos::RCP<const tpetra_map_type> &src_map,
                    const Teuchos::RCP<const tpetra_map_type> &tgt_map,
                    const local_ordinal_type blocksize_,
                    const local_ordinal_type_1d_view dm2cm_) {
      blocksize = blocksize_;
      dm2cm = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
      comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
#endif
      const tpetra_import_type import(src_map, tgt_map);

      createMpiRequests(import);
      createSendRecvIDs(import);

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
      {
        const local_ordinal_type num_streams = pids.send.extent(0);
        stream.send.resize(num_streams);
        callback_data.send.resize(num_streams);
        for (local_ordinal_type i=0;i<num_streams;++i)
          CUDA_SAFE_CALL(cudaStreamCreate(&stream.send[i]));
      }
      {
        const local_ordinal_type num_streams = pids.recv.extent(0);
        stream.recv.resize(num_streams);
        callback_data.recv.resize(num_streams);
        for (local_ordinal_type i=0;i<num_streams;++i)
          CUDA_SAFE_CALL(cudaStreamCreate(&stream.recv[i]));
      }
#endif
    }
    ~AsyncableImport() {
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
      {
        const local_ordinal_type num_streams = stream.send.size();
        for (local_ordinal_type i=0;i<num_streams;++i)
          CUDA_SAFE_CALL(cudaStreamDestroy(stream.send[i]));
      }
      {
        const local_ordinal_type num_streams = stream.recv.size();
        for (local_ordinal_type i=0;i<num_streams;++i)
          CUDA_SAFE_CALL(cudaStreamDestroy(stream.recv[i]));
      }
#endif
    }
    void createDataBuffer(const local_ordinal_type &num_vectors) {
      const size_type extent_0 = lids.recv.extent(0)*blocksize;
      const size_type extent_1 = num_vectors;
      if (remote_multivector.extent(0) == extent_0 &&
          remote_multivector.extent(1) == extent_1) {
        // buffers are already sized correctly; nothing to do
      } else {
        remote_multivector = impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);

        const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
        const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
        buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
        buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
      }
    }
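    // Sizing example (illustrative): with 10 remote lids, blocksize 5 and
    // num_vectors 2, remote_multivector is 50 x 2 and the recv buffer holds
    // offset_host.recv(last)*5*2 scalars, i.e. one blocksize*num_vectors slab
    // per remote block row.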
    void cancel() {
#ifdef HAVE_IFPACK2_MPI
      waitall(reqs.recv.size(), reqs.recv.data());
      waitall(reqs.send.size(), reqs.send.data());
#endif
    }
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
    template<typename PackTag>
    static
    void copyViaCudaStream(const local_ordinal_type_1d_view &lids_,
                           const impl_scalar_type_1d_view &buffer_,
                           const local_ordinal_type ibeg_,
                           const local_ordinal_type iend_,
                           const impl_scalar_type_2d_view_tpetra &multivector_,
                           const local_ordinal_type blocksize_,
                           const cudaStream_t &stream_) {
      const local_ordinal_type num_vectors = multivector_.extent(1);
      const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
      const local_ordinal_type idiff = iend_ - ibeg_;
      const auto abase = buffer_.data() + mv_blocksize*ibeg_;

      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      local_ordinal_type vector_size(0);
      if      (blocksize_ <=  4) vector_size =  4;
      else if (blocksize_ <=  8) vector_size =  8;
      else if (blocksize_ <= 16) vector_size = 16;
      else                       vector_size = 32;

      const auto exec_instance = Kokkos::Cuda(stream_);
      const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
      const team_policy_type policy(exec_instance, idiff, 1, vector_size);
      Kokkos::parallel_for
        ("AsyncableImport::TeamPolicy::copyViaCudaStream",
         Kokkos::Experimental::require(policy, work_item_property),
         KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
          const local_ordinal_type i = member.league_rank();
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
              auto aptr = abase + blocksize_*(i + idiff*j);
              auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
              if (std::is_same<PackTag,ToBuffer>::value)
                Kokkos::parallel_for
                  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                    aptr[k] = bptr[k];
                  });
              else
                Kokkos::parallel_for
                  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                    bptr[k] = aptr[k];
                  });
            });
        });
    }
    void asyncSendRecvViaCudaStream(const impl_scalar_type_2d_view_tpetra &mv) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
#ifdef HAVE_IFPACK2_MPI
      // constants
      const local_ordinal_type num_vectors = mv.extent(1);
      const local_ordinal_type mv_blocksize = blocksize*num_vectors;

      // receive async
      for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
        irecv(comm,
              reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }

      // pack the send buffer, one stream per neighbor
#if defined (KOKKOS_ENABLE_OPENMP)
#pragma omp parallel for
#endif
      for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
        auto &stream_at_i = stream.send[i];
        copyViaCudaStream<ToBuffer>(lids.send, buffer.send,
                                    offset_host.send(i), offset_host.send(i+1),
                                    mv, blocksize,
                                    stream_at_i);
      }

      // send async once each neighbor's packing stream has completed
#if defined (KOKKOS_ENABLE_OPENMP)
#pragma omp parallel for
#endif
      for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
        CUDA_SAFE_CALL(cudaStreamSynchronize(stream.send[i]));
        isend(comm,
              reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }

      // issuing an Iprobe seems to nudge some MPIs into action, which helps
      // with overlapped comm/comp performance
      for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
        int flag;
        MPI_Status stat;
        MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
      }
#endif // HAVE_IFPACK2_MPI
    }
    void syncRecvViaCudaStream() {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
#ifdef HAVE_IFPACK2_MPI
      // as receives complete, unpack each message on its own stream
#if defined (KOKKOS_ENABLE_OPENMP)
#pragma omp parallel for
#endif
      for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
        local_ordinal_type idx = i;
        waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
        auto &stream_at_idx = stream.recv[idx];
        copyViaCudaStream<ToMultiVector>(lids.recv, buffer.recv,
                                         offset_host.recv(idx), offset_host.recv(idx+1),
                                         remote_multivector, blocksize,
                                         stream_at_idx);
        CUDA_SAFE_CALL(cudaStreamSynchronize(stream_at_idx));
      }
      // wait on the sends to match all Isends with a cleanup operation
      waitall(reqs.send.size(), reqs.send.data());
#endif // HAVE_IFPACK2_MPI
    }
#endif //defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
    ///
    /// standard comm
    ///
    template<typename PackTag>
    static
    void copy(const local_ordinal_type_1d_view &lids_,
              const impl_scalar_type_1d_view &buffer_,
              const local_ordinal_type &ibeg_,
              const local_ordinal_type &iend_,
              const impl_scalar_type_2d_view_tpetra &multivector_,
              const local_ordinal_type blocksize_) {
      const local_ordinal_type num_vectors = multivector_.extent(1);
      const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
      const local_ordinal_type idiff = iend_ - ibeg_;
      const auto abase = buffer_.data() + mv_blocksize*ibeg_;
      if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
        using team_policy_type = Kokkos::TeamPolicy<execution_space>;
        local_ordinal_type vector_size(0);
        if      (blocksize_ <=  4) vector_size =  4;
        else if (blocksize_ <=  8) vector_size =  8;
        else if (blocksize_ <= 16) vector_size = 16;
        else                       vector_size = 32;
        const team_policy_type policy(idiff, 1, vector_size);
        Kokkos::parallel_for
          ("AsyncableImport::TeamPolicy::copy",
           policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
            const local_ordinal_type i = member.league_rank();
            Kokkos::parallel_for
              (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
                auto aptr = abase + blocksize_*(i + idiff*j);
                auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
                if (std::is_same<PackTag,ToBuffer>::value)
                  Kokkos::parallel_for
                    (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                      aptr[k] = bptr[k];
                    });
                else
                  Kokkos::parallel_for
                    (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                      bptr[k] = aptr[k];
                    });
              });
          });
#endif
      } else {
#if defined(__CUDA_ARCH__)
        // host-only path; should never be reached in device compilation
        Kokkos::abort("AsyncableImport::copy: host path called on device");
#else
        const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
        Kokkos::parallel_for
          ("AsyncableImport::RangePolicy::copy",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
            const local_ordinal_type i = ij%idiff;
            const local_ordinal_type j = ij/idiff;
            auto aptr = abase + blocksize_*(i + idiff*j);
            auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
            auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
            auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
            memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
          });
#endif
      }
    }
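    // Buffer layout note: for the [ibeg_, iend_) slice, block row i of vector
    // j starts at abase[blocksize_*(i + idiff*j)], i.e. block rows are fastest
    // and vectors are strided by idiff blocks, so a single contiguous slab per
    // remote process can be handed to MPI_Isend/MPI_Irecv without an MPI
    // derived datatype.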
    void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
#ifdef HAVE_IFPACK2_MPI
      // constants
      const local_ordinal_type num_vectors = mv.extent(1);
      const local_ordinal_type mv_blocksize = blocksize*num_vectors;

      // receive async
      for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
        irecv(comm,
              reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
              (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.recv[i],
              42,
              &reqs.recv[i]);
      }

      // send async
      for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
        copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
                       mv, blocksize);
        isend(comm,
              reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
              (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
              pids.send[i],
              42,
              &reqs.send[i]);
      }

      // issuing an Iprobe seems to nudge some MPIs into action, which helps
      // with overlapped comm/comp performance
      for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
        int flag;
        MPI_Status stat;
        MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
      }
#endif // HAVE_IFPACK2_MPI
    }
    void syncRecv() {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
#ifdef HAVE_IFPACK2_MPI
      // unpack each receive as it completes
      for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
        local_ordinal_type idx = i;
        waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
        copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
                            remote_multivector, blocksize);
      }
      // wait on the sends to match all Isends with a cleanup operation
      waitall(reqs.send.size(), reqs.send.data());
#endif // HAVE_IFPACK2_MPI
    }
    void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_STREAM)
      asyncSendRecvViaCudaStream(mv);
      syncRecvViaCudaStream();
#else
      asyncSendRecv(mv);
      syncRecv();
#endif
    }

    impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
  };
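  // Typical call sequence (sketch): createDataBuffer(nvec) once per number of
  // right-hand sides, then per apply either
  //   asyncSendRecv(mv); /* ... compute on owned rows ... */ syncRecv();
  // to overlap communication with computation, or syncExchange(mv) when no
  // overlap is wanted; getRemoteMultiVectorLocalView() exposes the unpacked
  // ghost data.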
  template<typename MatrixType>
  Teuchos::RCP<AsyncableImport<MatrixType> >
  createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
    using impl_type = ImplType<MatrixType>;
    using tpetra_map_type = typename impl_type::tpetra_map_type;
    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using global_ordinal_type = typename impl_type::global_ordinal_type;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

    const auto g = A->getCrsGraph();  // tpetra crs graph object
    const auto blocksize = A->getBlockSize();
    const auto domain_map = g.getDomainMap();
    const auto column_map = g.getColMap();

    std::vector<global_ordinal_type> gids;
    bool separate_remotes = true, found_first = false, need_owned_permutation = false;
    for (size_t i=0;i<column_map->getNodeNumElements();++i) {
      const global_ordinal_type gid = column_map->getGlobalElement(i);
      if (!domain_map->isNodeGlobalElement(gid)) {
        found_first = true;
        gids.push_back(gid);
      } else if (found_first) {
        separate_remotes = false;
        break;
      }
      if (!need_owned_permutation &&
          domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
        // The owned parts of the domain and column maps are differently
        // ordered, so a permutation will be necessary.
        need_owned_permutation = true;
      }
    }

    if (separate_remotes) {
      const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
      const auto parsimonious_col_map
        = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
      if (parsimonious_col_map->getGlobalNumElements() > 0) {
        // make a version of the column map with remote, e.g., ghost, gids only
        local_ordinal_type_1d_view dm2cm;
        if (need_owned_permutation) {
          dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getNodeNumElements());
          const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
          for (size_t i=0;i<domain_map->getNodeNumElements();++i)
            dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
          Kokkos::deep_copy(dm2cm, dm2cm_host);
        }
        return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
      }
    }
    return Teuchos::null;
  }
  ///
  /// part interface
  ///
  template<typename MatrixType>
  struct PartInterface {
    using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
    using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;

    PartInterface() = default;
    PartInterface(const PartInterface &b) = default;

    // Local row indices of A, ordered part by part.
    local_ordinal_type_1d_view lclrow;
    // partptr(i) points into lclrow at the start of the i'th part.
    local_ordinal_type_1d_view partptr;
    // packptr(i) indexes partptr at the first part of the i'th pack.
    local_ordinal_type_1d_view packptr;
    // part2rowidx0(i) is the flat row index at which the i'th part starts.
    local_ordinal_type_1d_view part2rowidx0;
    // part2packrowidx0(i) is the packed row index at which the i'th part
    // starts; parts sharing a pack share packed rows (one SIMD lane each).
    local_ordinal_type_1d_view part2packrowidx0;
    local_ordinal_type part2packrowidx0_back;
    // rowidx2part(i) maps the flat row index to its part.
    local_ordinal_type_1d_view rowidx2part;
    // True if the local rows of each part are contiguous and in order.
    bool row_contiguous;
    // Size of the largest part.
    local_ordinal_type max_partsz;
  };
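  // Worked example (illustrative): three parts of sizes {3,2,2} with
  // vector_length = 2 pack as {part0,part1},{part2}, giving
  //   partptr          = {0,3,5,7}
  //   part2rowidx0     = {0,3,5,7}
  //   part2packrowidx0 = {0,0,3,5}  // parts in one pack share packed rows
  //   packptr          = {0,2,3}
  //   part2packrowidx0_back = 5     // packed rows, each vector_length wide
  //   max_partsz       = 3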
  ///
  /// setup part interface from the container partitions array
  ///
  template<typename MatrixType>
  PartInterface<MatrixType>
  createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                      const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
    using impl_type = ImplType<MatrixType>;
    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

    constexpr int vector_length = impl_type::vector_length;

    const auto comm = A->getRowMap()->getComm();

    PartInterface<MatrixType> interf;

    const bool jacobi = partitions.size() == 0;
    const local_ordinal_type A_n_lclrows = A->getNodeNumRows();
    const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();

#if defined(BLOCKTRIDICONTAINER_DEBUG)
    local_ordinal_type nrows = 0;
    if (jacobi)
      nrows = nparts;
    else
      for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();

    TEUCHOS_TEST_FOR_EXCEPT_MSG
      (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
       << "the same as getNodeNumRows: " << nrows << " vs " << A_n_lclrows);
#endif

    // permutation of parts, sorted by nonincreasing size
    std::vector<local_ordinal_type> p;
    if (jacobi) {
      interf.max_partsz = 1;
    } else {
      p.resize(nparts);

      typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
      std::vector<size_idx_pair_type> partsz(nparts);
      for (local_ordinal_type i=0;i<nparts;++i)
        partsz[i] = size_idx_pair_type(partitions[i].size(), i);
      std::sort(partsz.begin(), partsz.end(),
                [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
                  return x.first > y.first;
                });
      for (local_ordinal_type i=0;i<nparts;++i)
        p[i] = partsz[i].second;

      interf.max_partsz = partsz[0].first;
    }

    interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
    interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
    interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
    interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
    interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

    // mirror to host and compute there
    const auto partptr = Kokkos::create_mirror_view(interf.partptr);
    const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
    const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
    const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
    const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);

    // determine parts
    interf.row_contiguous = true;
    partptr(0) = 0;
    part2rowidx0(0) = 0;
    part2packrowidx0(0) = 0;
    local_ordinal_type pack_nrows = 0;
    for (local_ordinal_type ip=0;ip<nparts;++ip) {
      const auto* part = jacobi ? NULL : &partitions[p[ip]];
      const local_ordinal_type ipnrows = jacobi ? 1 : part->size();
      TEUCHOS_ASSERT(ip == 0 || (jacobi || ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
      TEUCHOS_TEST_FOR_EXCEPT_MSG
        (ipnrows == 0, get_msg_prefix(comm)
         << "partition " << p[ip]
         << " is empty, which is not allowed.");
      // assume no overlap
      part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
      // Since parts are ordered in nonincreasing size, the size of the first
      // part in a pack is the size for all parts in the pack.
      if (ip % vector_length == 0) pack_nrows = ipnrows;
      part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
      const local_ordinal_type os = partptr(ip);
      for (local_ordinal_type i=0;i<ipnrows;++i) {
        const auto lcl_row = jacobi ? ip : (*part)[i];
        TEUCHOS_TEST_FOR_EXCEPT_MSG
          (lcl_row < 0 || lcl_row >= A_n_lclrows, get_msg_prefix(comm)
           << "partitions[" << p[ip] << "]["
           << i << "] = " << lcl_row
           << " but input matrix implies limits of [0, " << A_n_lclrows-1
           << "].");
        lclrow(os+i) = lcl_row;
        rowidx2part(os+i) = ip;
        if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
          interf.row_contiguous = false;
      }
      partptr(ip+1) = os + ipnrows;
    }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
    TEUCHOS_ASSERT(partptr(nparts) == nrows);
#endif
    if (lclrow(0) != 0) interf.row_contiguous = false;

    Kokkos::deep_copy(interf.partptr, partptr);
    Kokkos::deep_copy(interf.lclrow, lclrow);

    // assume no overlap; thus:
    interf.part2rowidx0 = interf.partptr;
    Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);

    interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
    Kokkos::deep_copy(interf.rowidx2part, rowidx2part);

    { // fill packptr
      local_ordinal_type npacks = 0;
      for (local_ordinal_type ip=1;ip<=nparts;++ip)
        if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
          ++npacks;

      interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
      const auto packptr = Kokkos::create_mirror_view(interf.packptr);
      packptr(0) = 0;
      for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
        if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
          packptr(k++) = ip;
      Kokkos::deep_copy(interf.packptr, packptr);
    }

    return interf;
  }
  ///
  /// block tridiagonals
  ///
  template <typename MatrixType>
  struct BlockTridiags {
    using impl_type = ImplType<MatrixType>;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using size_type_1d_view = typename impl_type::size_type_1d_view;
    using vector_type_3d_view = typename impl_type::vector_type_3d_view;

    // flat_td_ptr(i) is the index into the flat block array of the start of
    // the i'th tridiag; pack_td_ptr is the same for the packed storage.
    size_type_1d_view flat_td_ptr, pack_td_ptr;

    // Row-local column indices into A from which to grab block data.
    local_ordinal_type_1d_view A_colindsub;

    // Tridiag block values, packed vector_length parts per entry.
    vector_type_3d_view values;

    bool is_diagonal_only;

    BlockTridiags() = default;
    BlockTridiags(const BlockTridiags &b) = default;

    // Given a block index into a row-major tridiag, return its block row.
    template <typename idx_type>
    static KOKKOS_FORCEINLINE_FUNCTION
    idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
    // Given a block row, return the index of its first block.
    template <typename idx_type>
    static KOKKOS_FORCEINLINE_FUNCTION
    idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
    // Number of blocks in a tridiag with the given number of block rows.
    template <typename idx_type>
    static KOKKOS_FORCEINLINE_FUNCTION
    idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
  };
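  // Worked example: a part with nrows = 4 stores NumBlocks(4) = 10 blocks in
  // the order D0 U0 | L1 D1 U1 | L2 D2 U2 | L3 D3. RowToIndex(2) = 5 is the
  // first block (L2) of row 2, and IndexToRow(5) = 2 recovers the row.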
  ///
  /// block tridiags initialization from part interface
  ///
  template<typename MatrixType>
  BlockTridiags<MatrixType>
  createBlockTridiags(const PartInterface<MatrixType> &interf) {
    using impl_type = ImplType<MatrixType>;
    using execution_space = typename impl_type::execution_space;
    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using size_type = typename impl_type::size_type;
    using size_type_1d_view = typename impl_type::size_type_1d_view;

    constexpr int vector_length = impl_type::vector_length;

    BlockTridiags<MatrixType> btdm;

    const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;

    { // construct the flat index pointers into the tridiag values array
      btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
      const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
      Kokkos::parallel_scan
        ("createBlockTridiags::RangePolicy::flat_td_ptr",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
          if (final)
            btdm.flat_td_ptr(i) = update;
          if (i < ntridiags) {
            const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
            update += btdm.NumBlocks(nrows);
          }
        });

      const auto nblocks = Kokkos::create_mirror_view_and_copy
        (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
      btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
    }

    // and the packed index pointers
    if (vector_length == 1) {
      btdm.pack_td_ptr = btdm.flat_td_ptr;
    } else {
      const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
      btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
      const Kokkos::RangePolicy<execution_space> policy(0,npacks);
      Kokkos::parallel_scan
        ("createBlockTridiags::RangePolicy::pack_td_ptr",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
          const local_ordinal_type parti = interf.packptr(i);
          const local_ordinal_type parti_next = interf.packptr(i+1);
          if (final) {
            const size_type nblks = update;
            for (local_ordinal_type pti=parti;pti<parti_next;++pti)
              btdm.pack_td_ptr(pti) = nblks;
            if (i == npacks-1) {
              const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
              btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
            }
          }
          {
            const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
            update += btdm.NumBlocks(nrows);
          }
        });
    }

    return btdm;
  }
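  // Continuing the {3,2,2}, vector_length = 2 example above: NumBlocks gives
  // {7,4,4}, so flat_td_ptr = {0,7,11,15}; packs store blocks once at the pack
  // leader's size, so pack_td_ptr = {0,0,7,11} and values holds 11 packed
  // blocks, each vector_length parts wide.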
  ///
  /// set tridiags to identity
  ///
  template<typename MatrixType>
  void
  setTridiagsToIdentity
  (const BlockTridiags<MatrixType>& btdm,
   const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
  {
    using impl_type = ImplType<MatrixType>;
    using execution_space = typename impl_type::execution_space;
    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using size_type_1d_view = typename impl_type::size_type_1d_view;

    const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
    const local_ordinal_type blocksize = btdm.values.extent(1);

    {
      const int vector_length = impl_type::vector_length;
      const int internal_vector_length = impl_type::internal_vector_length;

      using btdm_scalar_type = typename impl_type::btdm_scalar_type;
      using internal_vector_type = typename impl_type::internal_vector_type;
      using internal_vector_type_4d_view =
        typename impl_type::internal_vector_type_4d_view;

      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      const internal_vector_type_4d_view values
        (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
         btdm.values.extent(0),
         btdm.values.extent(1),
         btdm.values.extent(2),
         vector_length/internal_vector_length);
      const local_ordinal_type vector_loop_size = values.extent(3);
#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
      local_ordinal_type total_team_size(0);
      if      (blocksize <=  5) total_team_size =  32;
      else if (blocksize <=  9) total_team_size =  64;
      else if (blocksize <= 12) total_team_size =  96;
      else if (blocksize <= 16) total_team_size = 128;
      else if (blocksize <= 20) total_team_size = 160;
      else                      total_team_size = 160;
      const local_ordinal_type team_size = total_team_size/vector_loop_size;
      const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
#else // host architecture: team size is always one
      const team_policy_type policy(packptr.extent(0)-1, 1, 1);
#endif
      Kokkos::parallel_for
        ("setTridiagsToIdentity::TeamPolicy",
         policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
          const local_ordinal_type k = member.league_rank();
          const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
          const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
          const local_ordinal_type diff = iend - ibeg;
          const local_ordinal_type icount = diff/3 + (diff%3 > 0);
          const btdm_scalar_type one(1);
          Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
            Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
              const local_ordinal_type i = ibeg + ii*3;
              for (local_ordinal_type j=0;j<blocksize;++j)
                values(i,j,j,v) = one;
            });
          });
        });
    }
  }
  ///
  /// A - D
  ///
  template <typename MatrixType>
  struct AmD {
    using impl_type = ImplType<MatrixType>;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using size_type_1d_view = typename impl_type::size_type_1d_view;
    using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;

    // rowptr points to the start of each row of A_colindsub.
    size_type_1d_view rowptr, rowptr_remote;
    // Row-local indices of the blocks to extract; A_colindsub holds the owned
    // entries and A_colindsub_remote the ghosted ones when communication and
    // computation are overlapped.
    local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;

    // Currently always true.
    bool is_tpetra_block_crs;

    // If is_tpetra_block_crs, this points to A's value data.
    impl_scalar_type_1d_view_tpetra tpetra_values;

    AmD() = default;
    AmD(const AmD &b) = default;
  };
  ///
  /// symbolic phase, on host: split A = D + R, where D holds the tridiag blocks
  ///
  template<typename MatrixType>
  void
  performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                       const PartInterface<MatrixType> &interf,
                       BlockTridiags<MatrixType> &btdm,
                       AmD<MatrixType> &amd,
                       const bool overlap_communication_and_computation) {
    IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");

    using impl_type = ImplType<MatrixType>;

    using node_memory_space = typename impl_type::node_memory_space;
    using host_execution_space = typename impl_type::host_execution_space;

    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using global_ordinal_type = typename impl_type::global_ordinal_type;
    using size_type = typename impl_type::size_type;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using size_type_1d_view = typename impl_type::size_type_1d_view;
    using vector_type_3d_view = typename impl_type::vector_type_3d_view;
    using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

    constexpr int vector_length = impl_type::vector_length;

    const auto comm = A->getRowMap()->getComm();
    const auto& g = A->getCrsGraph();
    const auto blocksize = A->getBlockSize();

    // mirror part interface views to host
    const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
    const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
    const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
    const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
    const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);

    const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);

    Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getNodeNumCols());
    Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());

    // find column-to-row map on host
    {
      const auto rowmap = g.getRowMap();
      const auto colmap = g.getColMap();
      const auto dommap = g.getDomainMap();
      TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));

#if !defined(__CUDA_ARCH__)
      const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
      Kokkos::parallel_for
        ("performSymbolicPhase::RangePolicy::col2row",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
          const global_ordinal_type gid = rowmap->getGlobalElement(lr);
          if (dommap->isNodeGlobalElement(gid)) {
            const local_ordinal_type lc = colmap->getLocalElement(gid);
# if defined(BLOCKTRIDICONTAINER_DEBUG)
            TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
                                        get_msg_prefix(comm) << "GID " << gid
                                        << " gives an invalid local column.");
# endif
            col2row(lc) = lr;
          }
        });
#endif
    }

    // construct the D and R graphs in A = D + R
    const auto& local_graph = g.getLocalGraph();
    const auto& local_graph_rowptr = local_graph.row_map;
    TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
    const auto& local_graph_colidx = local_graph.entries;

    // assume no overlap
    Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
    {
      const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
      Kokkos::parallel_for
        ("performSymbolicPhase::RangePolicy::lclrow2idx",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
          lclrow2idx[lclrow(i)] = i;
        });
    }
    // count (block) nnzs in D and R
    typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
    typename sum_reducer_type::value_type sum_reducer_value;
    {
      const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
      Kokkos::parallel_reduce
        ("performSymbolicPhase::RangePolicy::count_nnz",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
          // LID -> index
          const local_ordinal_type ri0 = lclrow2idx[lr];
          const local_ordinal_type pi0 = rowidx2part(ri0);
          for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
            const local_ordinal_type lc = local_graph_colidx(j);
            const local_ordinal_type lc2r = col2row[lc];
            bool incr_R = false;
            do { // breakable
              if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
                incr_R = true;
                break;
              }
              const local_ordinal_type ri = lclrow2idx[lc2r];
              const local_ordinal_type pi = rowidx2part(ri);
              if (pi != pi0) {
                incr_R = true;
                break;
              }
              // test for being in the tridiag, in index space
              if (ri0 + 1 >= ri && ri0 <= ri + 1)
                ++update.v[0]; // D_nnz
              else
                incr_R = true;
            } while (0);
            if (incr_R) {
              if (lc < nrows) ++update.v[1]; // R_nnz_owned
              else            ++update.v[2]; // R_nnz_remote
            }
          }
        }, sum_reducer_type(sum_reducer_value));
    }
    size_type D_nnz = sum_reducer_value.v[0];
    size_type R_nnz_owned = sum_reducer_value.v[1];
    size_type R_nnz_remote = sum_reducer_value.v[2];

    if (!overlap_communication_and_computation) {
      R_nnz_owned += R_nnz_remote;
      R_nnz_remote = 0;
    }
    // construct the D graph
    {
      const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);

      btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
      const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);

#if defined(BLOCKTRIDICONTAINER_DEBUG)
      Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif

      const local_ordinal_type nparts = partptr.extent(0) - 1;
      {
        const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
        Kokkos::parallel_for
          ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
            const local_ordinal_type part_ri0 = part2rowidx0(pi0);
            local_ordinal_type offset = 0;
            for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
              const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
              offset = 1;
              const local_ordinal_type lr0 = lclrow(ri0);
              const size_type j0 = local_graph_rowptr(lr0);
              for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
                const local_ordinal_type lc = local_graph_colidx(j);
                const local_ordinal_type lc2r = col2row[lc];
                if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) continue;
                const local_ordinal_type ri = lclrow2idx[lc2r];
                const local_ordinal_type pi = rowidx2part(ri);
                if (pi != pi0) continue;
                if (ri + 1 < ri0 || ri > ri0 + 1) continue;
                const local_ordinal_type row_entry = j - j0;
                D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
              }
            }
          });
      }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
      for (size_t i=0;i<D_A_colindsub.extent(0);++i)
        TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif
      Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);

      // allocate values
      {
        const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
        const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
        btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
        if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
      }
    }
    // construct the R graph
    {
      amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
      amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);

      const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
      const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);

      amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
      amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);

      const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
      const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);

      { // count entries per row
        const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
        Kokkos::parallel_for
          ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
            const local_ordinal_type ri0 = lclrow2idx[lr];
            const local_ordinal_type pi0 = rowidx2part(ri0);
            const size_type j0 = local_graph_rowptr(lr);
            for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
              const local_ordinal_type lc = local_graph_colidx(j);
              const local_ordinal_type lc2r = col2row[lc];
              if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
                const local_ordinal_type ri = lclrow2idx[lc2r];
                const local_ordinal_type pi = rowidx2part(ri);
                if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
                  continue; // the entry belongs to D
                }
              }
              // exclusive scan is performed later
              if (!overlap_communication_and_computation || lc < nrows) {
                ++R_rowptr(lr);
              } else {
                ++R_rowptr_remote(lr);
              }
            }
          });
      }

      { // exclusive scan and fill A_colindsub
        typedef ArrayValueType<size_type,2> update_type;
        Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
        Kokkos::parallel_scan
          ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
                                 update_type &update,
                                 const bool &final) {
            update_type val;
            val.v[0] = R_rowptr(lr);
            if (overlap_communication_and_computation)
              val.v[1] = R_rowptr_remote(lr);

            if (final) {
              R_rowptr(lr) = update.v[0];
              if (overlap_communication_and_computation)
                R_rowptr_remote(lr) = update.v[1];

              if (lr < nrows) {
                const local_ordinal_type ri0 = lclrow2idx[lr];
                const local_ordinal_type pi0 = rowidx2part(ri0);

                size_type cnt_rowptr = R_rowptr(lr);
                size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0;

                const size_type j0 = local_graph_rowptr(lr);
                for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
                  const local_ordinal_type lc = local_graph_colidx(j);
                  const local_ordinal_type lc2r = col2row[lc];
                  if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
                    const local_ordinal_type ri = lclrow2idx[lc2r];
                    const local_ordinal_type pi = rowidx2part(ri);
                    if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
                      continue;
                  }
                  const local_ordinal_type row_entry = j - j0;
                  if (!overlap_communication_and_computation || lc < nrows)
                    R_A_colindsub(cnt_rowptr++) = row_entry;
                  else
                    R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
                }
              }
            }
            update += val;
          });
      }
      Kokkos::deep_copy(amd.rowptr, R_rowptr);
      Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
      if (overlap_communication_and_computation) {
        Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
        Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
      }

      // R directly views the owned part of A's values
      amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->
                           template getValues<node_memory_space>());
    }
  }
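  // Example of the split (illustrative): for a 1-D Laplacian whose parts are
  // contiguous, in-order row ranges, every off-diagonal entry is within
  // distance 1 of the diagonal inside its own part, so all block entries land
  // in D and R stays empty; coupling across parts or to ghosted columns is
  // what populates R (and R_remote when overlap is enabled).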
  ///
  /// default mode and algo for the batched kernels, by memory space
  ///
  template<typename ArgActiveExecutionMemorySpace>
  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;

  template<>
  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
    typedef KB::Mode::Serial mode_type;
#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
    typedef KB::Algo::Level3::CompactMKL algo_type;
#else
    typedef KB::Algo::Level3::Blocked algo_type;
#endif
    static int recommended_team_size(const int /* blksize */,
                                     const int /* vector_length */,
                                     const int /* internal_vector_length */) {
      return 1;
    }
  };

#if defined(KOKKOS_ENABLE_CUDA)
  static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
                                                               const int vector_length,
                                                               const int internal_vector_length) {
    const int vector_size = vector_length/internal_vector_length;
    int total_team_size(0);
    if      (blksize <=  5) total_team_size =  32;
    else if (blksize <=  9) total_team_size =  32;
    else if (blksize <= 12) total_team_size =  96;
    else if (blksize <= 16) total_team_size = 128;
    else if (blksize <= 20) total_team_size = 160;
    else                    total_team_size = 160;
    return 2*total_team_size/vector_size;
  }
  template<>
  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
    typedef KB::Mode::Team mode_type;
    typedef KB::Algo::Level3::Unblocked algo_type;
    static int recommended_team_size(const int blksize,
                                     const int vector_length,
                                     const int internal_vector_length) {
      return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
    }
  };
  template<>
  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
    typedef KB::Mode::Team mode_type;
    typedef KB::Algo::Level3::Unblocked algo_type;
    static int recommended_team_size(const int blksize,
                                     const int vector_length,
                                     const int internal_vector_length) {
      return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
    }
  };
#endif
  ///
  /// extract and factorize the block tridiagonals
  ///
  template<typename MatrixType>
  struct ExtractAndFactorizeTridiags {
  public:
    using impl_type = ImplType<MatrixType>;
    // a functor cannot have both device_type and execution_space
    using execution_space = typename impl_type::execution_space;
    using memory_space = typename impl_type::memory_space;

    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using size_type = typename impl_type::size_type;
    using impl_scalar_type = typename impl_type::impl_scalar_type;
    using magnitude_type = typename impl_type::magnitude_type;
    // tpetra types
    using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
    // views
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using size_type_1d_view = typename impl_type::size_type_1d_view;
    using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
    // vectorization
    using btdm_scalar_type = typename impl_type::btdm_scalar_type;
    using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
    using vector_type_3d_view = typename impl_type::vector_type_3d_view;
    using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
    using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
    using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
    using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;

    using internal_vector_type = typename impl_type::internal_vector_type;
    static constexpr int vector_length = impl_type::vector_length;
    static constexpr int internal_vector_length = impl_type::internal_vector_length;

    // team policy member type
    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    using member_type = typename team_policy_type::member_type;

  private:
    // part interface
    const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
    const local_ordinal_type max_partsz;
    // block crs matrix (the graph's rowptr value type can differ from size_type)
    using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::node_device_type>::size_type;
    using size_type_1d_view_tpetra = Kokkos::View<a_rowptr_value_type*,typename impl_type::node_device_type>;
    const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
    const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
    // block tridiags
    const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
    const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
    const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
    const Unmanaged<btdm_scalar_type_4d_view> scalar_values;
    // shared information
    const local_ordinal_type blocksize, blocksize_square;
    // diagonal safeguard
    const magnitude_type tiny;
    const local_ordinal_type vector_loop_size;
    const local_ordinal_type vector_length_value;

  public:
    ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
                                const PartInterface<MatrixType> &interf_,
                                const Teuchos::RCP<const block_crs_matrix_type> &A_,
                                const magnitude_type& tiny_) :
      // interface
      partptr(interf_.partptr),
      lclrow(interf_.lclrow),
      packptr(interf_.packptr),
      max_partsz(interf_.max_partsz),
      // block crs matrix
      A_rowptr(A_->getCrsGraph().getLocalGraph().row_map),
      A_values(const_cast<block_crs_matrix_type*>(A_.get())->template getValues<memory_space>()),
      // block tridiags
      pack_td_ptr(btdm_.pack_td_ptr),
      flat_td_ptr(btdm_.flat_td_ptr),
      A_colindsub(btdm_.A_colindsub),
      internal_vector_values((internal_vector_type*)btdm_.values.data(),
                             btdm_.values.extent(0),
                             btdm_.values.extent(1),
                             btdm_.values.extent(2),
                             vector_length/internal_vector_length),
      scalar_values((btdm_scalar_type*)btdm_.values.data(),
                    btdm_.values.extent(0),
                    btdm_.values.extent(1),
                    btdm_.values.extent(2),
                    vector_length),
      blocksize(btdm_.values.extent(1)),
      blocksize_square(blocksize*blocksize),
      tiny(tiny_),
      vector_loop_size(vector_length/internal_vector_length),
      vector_length_value(vector_length) {}
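    // Note: internal_vector_values and scalar_values alias the same
    // btdm_.values allocation at different granularities. E.g. (illustrative)
    // with vector_length = 8 and internal_vector_length = 4, a values view of
    // extents (nblks, bs, bs) in vector_type is seen as (nblks, bs, bs, 2) in
    // internal_vector_type and (nblks, bs, bs, 8) in btdm_scalar_type.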
  private:

    KOKKOS_INLINE_FUNCTION
    void
    extract(local_ordinal_type partidx,
            local_ordinal_type npacks) const {
      const size_type kps = pack_td_ptr(partidx);
      local_ordinal_type kfs[vector_length] = {};
      local_ordinal_type ri0[vector_length] = {};
      local_ordinal_type nrows[vector_length] = {};

      for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
        kfs[vi] = flat_td_ptr(partidx);
        ri0[vi] = partptr(partidx);
        nrows[vi] = partptr(partidx+1) - ri0[vi];
      }
      for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
        for (local_ordinal_type e=0;e<3;++e) {
          const impl_scalar_type* block[vector_length] = {};
          for (local_ordinal_type vi=0;vi<npacks;++vi) {
            const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
            block[vi] = &A_values(Aj*blocksize_square);
          }
          const size_type pi = kps + j;
          ++j;
          for (local_ordinal_type ii=0;ii<blocksize;++ii) {
            for (local_ordinal_type jj=0;jj<blocksize;++jj) {
              const auto idx = ii*blocksize + jj;
              auto& v = internal_vector_values(pi, ii, jj, 0);
              for (local_ordinal_type vi=0;vi<npacks;++vi)
                v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
            }
          }

          if (nrows[0] == 1) break;
          if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
          for (local_ordinal_type vi=1;vi<npacks;++vi) {
            if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
              npacks = vi;
              break;
            }
          }
        }
      }
    }
KOKKOS_INLINE_FUNCTION
void
extract(const member_type &member,
        const local_ordinal_type &partidxbeg,
        const local_ordinal_type &npacks,
        const local_ordinal_type &vbeg) const {
  local_ordinal_type kfs_vals[internal_vector_length] = {};
  local_ordinal_type ri0_vals[internal_vector_length] = {};
  local_ordinal_type nrows_vals[internal_vector_length] = {};

  const size_type kps = pack_td_ptr(partidxbeg);
  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
    kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
    ri0_vals[vi] = partptr(partidxbeg+vi);
    nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
  }

  local_ordinal_type j_vals[internal_vector_length] = {};
  for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {
    for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
      const local_ordinal_type nrows = nrows_vals[vi];
      if (tr < nrows) {
        auto &j = j_vals[vi];
        const local_ordinal_type kfs = kfs_vals[vi];
        const local_ordinal_type ri0 = ri0_vals[vi];
        const local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
        const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
        for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
          const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
          const impl_scalar_type* block = &A_values(Aj*blocksize_square);
          const size_type pi = kps + j;
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member,blocksize),
             [&](const local_ordinal_type &ii) {
              for (local_ordinal_type jj=0;jj<blocksize;++jj)
                scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[ii*blocksize + jj]);
            });
        }
      }
    }
  }
}
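// Factorize the packed block tridiagonals in place. A minimal sketch of the
// recurrence, under the assumption (consistent with the solve phase below)
// that the row-r triple is stored at offsets (i, i+1, i+2) as
// (diagonal D_r, upper coupling B_r, lower coupling C_r):
//   D_1 = L_1 U_1                     (in-place LU with diagonal shift tiny)
//   for r = 1 .. nrows-1:
//     B_r := L_r^{-1} B_r             (Trsm, left/lower/unit)
//     C_r := C_r U_r^{-1}             (Trsm, right/upper/non-unit)
//     D_{r+1} := D_{r+1} - C_r B_r    (Gemm), then D_{r+1} = LU(D_{r+1})
// Single-row parts (nrows == 1) instead invert the diagonal block
// explicitly, so the solve phase reduces to one Gemv.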
template<typename AAViewType,
         typename WWViewType>
KOKKOS_INLINE_FUNCTION
void
factorize(const member_type &member,
          const local_ordinal_type &i0,
          const local_ordinal_type &nrows,
          const local_ordinal_type &v,
          const AAViewType &AA,
          const WWViewType &WW) const {
  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
    <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
  typedef default_mode_and_algo_type::mode_type default_mode_type;
  typedef default_mode_and_algo_type::algo_type default_algo_type;

  // constant
  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();

  // subview pattern
  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
  KB::LU<member_type,
         default_mode_type,KB::Algo::LU::Unblocked>
    ::invoke(member, A, tiny);
  if (nrows > 1) {
    auto B = A;
    auto C = A;
    local_ordinal_type i = i0;
    for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
      B.assign_data( &AA(i+1,0,0,v) );
      KB::Trsm<member_type,
               KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, B);
      C.assign_data( &AA(i+2,0,0,v) );
      KB::Trsm<member_type,
               KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, C);
      A.assign_data( &AA(i+3,0,0,v) );
      KB::Gemm<member_type,
               KB::Trans::NoTranspose,KB::Trans::NoTranspose,
               default_mode_type,default_algo_type>
        ::invoke(member, -one, C, B, one, A);
      KB::LU<member_type,
             default_mode_type,KB::Algo::LU::Unblocked>
        ::invoke(member, A, tiny);
    }
  } else {
    // for block jacobi, invert the (single) diagonal block here
    auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
    KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
      ::invoke(member, A, W);
    KB::SetIdentity<member_type,default_mode_type>
      ::invoke(member, A);
    member.team_barrier();
    KB::Trsm<member_type,
             KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
             default_mode_type,default_algo_type>
      ::invoke(member, one, W, A);
    KB::Trsm<member_type,
             KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
             default_mode_type,default_algo_type>
      ::invoke(member, one, W, A);
  }
}
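// One team (league rank) is assigned per pack; WW is per-team scratch used
// only for the explicit inversion in the single-row case. When
// vector_loop_size > 1 the SIMD lanes are split across the thread-vector
// range and each slice extracts and factorizes its own sub-pack.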
/// team policy member id
struct ExtractAndFactorizeTag {};

KOKKOS_INLINE_FUNCTION
void
operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
  const local_ordinal_type packidx = member.league_rank();

  const local_ordinal_type partidx = packptr(packidx);
  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
  const local_ordinal_type i0 = pack_td_ptr(partidx);
  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);

  internal_vector_scratch_type_3d_view
    WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
  if (vector_loop_size == 1) {
    extract(partidx, npacks);
    factorize(member, i0, nrows, 0, internal_vector_values, WW);
  } else {
    Kokkos::parallel_for
      (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const local_ordinal_type &v) {
        const local_ordinal_type vbeg = v*internal_vector_length;
        if (vbeg < npacks)
          extract(member, partidx+vbeg, npacks, vbeg);
        factorize(member, i0, nrows, v, internal_vector_values, WW);
      });
  }
}

void run() {
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
  const local_ordinal_type team_size =
    ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
    recommended_team_size(blocksize, vector_length, internal_vector_length);
  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
    shmem_size(blocksize, blocksize, vector_loop_size);

  Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
    policy(packptr.extent(0)-1, team_size, vector_loop_size);
#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
                       policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
#else
  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
                       policy, *this);
#endif
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
}
};
///
/// top level numeric interface
///
template<typename MatrixType>
void
performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                    const PartInterface<MatrixType> &interf,
                    BlockTridiags<MatrixType> &btdm,
                    const typename ImplType<MatrixType>::magnitude_type tiny) {
  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
  ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
  function.run();
}
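// MultiVectorConverter repacks a flat (scalar) Tpetra multivector into the
// SIMD-packed 3-D layout packed_multivector(packed_row, i, col)[lane] that
// the tridiagonal solver consumes, one SIMD lane per part of a pack.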
template<typename MatrixType>
struct MultiVectorConverter {
public:
  using impl_type = ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
  static constexpr int vector_length = impl_type::vector_length;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
  const local_ordinal_type blocksize;
  const local_ordinal_type num_vectors;

  // packed and flat multivector views
  vector_type_3d_view packed_multivector;
  impl_scalar_type_2d_view_tpetra scalar_multivector;

  template<typename TagType>
  KOKKOS_INLINE_FUNCTION
  void copy_multivectors(const local_ordinal_type &j,
                         const local_ordinal_type &vi,
                         const local_ordinal_type &pri,
                         const local_ordinal_type &ri0) const {
    for (local_ordinal_type col=0;col<num_vectors;++col)
      for (local_ordinal_type i=0;i<blocksize;++i)
        packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
  }

  MultiVectorConverter(const PartInterface<MatrixType> &interf,
                       const vector_type_3d_view &pmv)
    : partptr(interf.partptr),
      packptr(interf.packptr),
      part2packrowidx0(interf.part2packrowidx0),
      part2rowidx0(interf.part2rowidx0),
      lclrow(interf.lclrow),
      blocksize(pmv.extent(1)),
      num_vectors(pmv.extent(2)),
      packed_multivector(pmv) {}
KOKKOS_INLINE_FUNCTION
void
operator() (const local_ordinal_type &packidx) const {
  local_ordinal_type partidx = packptr(packidx);
  local_ordinal_type npacks = packptr(packidx+1) - partidx;
  const local_ordinal_type pri0 = part2packrowidx0(partidx);

  local_ordinal_type ri0[vector_length] = {};
  local_ordinal_type nrows[vector_length] = {};
  for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
    ri0[v] = part2rowidx0(partidx);
    nrows[v] = part2rowidx0(partidx+1) - ri0[v];
  }
  for (local_ordinal_type j=0;j<nrows[0];++j) {
    local_ordinal_type cnt = 1;
    for (;cnt<npacks && j!= nrows[cnt];++cnt);
    npacks = cnt;
    const local_ordinal_type pri = pri0 + j;
    for (local_ordinal_type col=0;col<num_vectors;++col)
      for (local_ordinal_type i=0;i<blocksize;++i)
        for (local_ordinal_type v=0;v<npacks;++v)
          packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
  }
}
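// Team-parallel variant of the repacking: one team per pack, the vector
// range spans the parts of the pack, and the thread range spans block rows;
// single-row parts take the dedicated short path.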
KOKKOS_INLINE_FUNCTION
void
operator() (const member_type &member) const {
  const local_ordinal_type packidx = member.league_rank();
  const local_ordinal_type partidx_begin = packptr(packidx);
  const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
    const local_ordinal_type partidx = partidx_begin + v;
    const local_ordinal_type ri0 = part2rowidx0(partidx);
    const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;

    if (nrows == 1) {
      const local_ordinal_type pri = pri0;
      for (local_ordinal_type col=0;col<num_vectors;++col) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
          packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
        });
      }
    } else {
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
        const local_ordinal_type pri = pri0 + j;
        for (local_ordinal_type col=0;col<num_vectors;++col)
          for (local_ordinal_type i=0;i<blocksize;++i)
            packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
      });
    }
  });
}
void run(const impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter");

  scalar_multivector = scalar_multivector_;
  if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
    const local_ordinal_type vl = vector_length;
    const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
    Kokkos::parallel_for
      ("MultiVectorConverter::TeamPolicy", policy, *this);
#endif
  } else {
#if defined(__CUDA_ARCH__)
    TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
    const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
    Kokkos::parallel_for
      ("MultiVectorConverter::RangePolicy", policy, *this);
#endif
  }
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
}
};
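// Mode/algorithm traits for the tridiagonal solves, specialized on the
// memory space of the execution: serial KokkosBatched kernels (optionally
// MKL compact batched for the multi-vector path) on host, team-parallel
// kernels on the CUDA spaces below.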
template<typename ArgActiveExecutionMemorySpace>
struct SolveTridiagsDefaultModeAndAlgo;

template<>
struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
  typedef KB::Mode::Serial mode_type;
  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
  typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
#else
  typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
#endif
  static int recommended_team_size(const int /*blksize*/,
                                   const int /*vector_length*/,
                                   const int /*internal_vector_length*/) {
    return 1;
  }
};
#if defined(KOKKOS_ENABLE_CUDA)
static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
                                                       const int vector_length,
                                                       const int internal_vector_length) {
  const int vector_size = vector_length/internal_vector_length;
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return total_team_size/vector_size;
}
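// Illustrative example: with blksize = 10 and vector_length/internal_vector_length = 4,
// total_team_size is 96, so the returned team size is 96/4 = 24 and the
// resulting CUDA block holds 24*4 = 96 threads.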
template<>
struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
  }
};

template<>
struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
  typedef KB::Mode::Team mode_type;
  typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
  typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
  static int recommended_team_size(const int blksize,
                                   const int vector_length,
                                   const int internal_vector_length) {
    return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
  }
};
#endif
template<typename MatrixType>
struct SolveTridiags {
public:
  using impl_type = ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;

  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;

  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;

  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;

  using internal_vector_type = typename impl_type::internal_vector_type;
  static constexpr int vector_length = impl_type::vector_length;
  static constexpr int internal_vector_length = impl_type::internal_vector_length;

  // multivector view access
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;

  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
  using member_type = typename team_policy_type::member_type;

private:
  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;

  // block tridiags
  const ConstUnmanaged<size_type_1d_view> pack_td_ptr;

  // block tridiags values and packed solution
  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;

  const local_ordinal_type vector_loop_size;

  // copy to flat multivector
  Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
#if defined(__CUDA_ARCH__)
  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
#else
  Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
#endif
  const impl_scalar_type df;
  const bool compute_diff;
public:
  SolveTridiags(const PartInterface<MatrixType> &interf,
                const BlockTridiags<MatrixType> &btdm,
                const vector_type_3d_view &pmv,
                const impl_scalar_type damping_factor,
                const bool is_norm_manager_active)
    :
    // interface
    partptr(interf.partptr),
    packptr(interf.packptr),
    part2packrowidx0(interf.part2packrowidx0),
    lclrow(interf.lclrow),
    // block tridiags and packed multivector
    pack_td_ptr(btdm.pack_td_ptr),
    D_internal_vector_values((internal_vector_type*)btdm.values.data(),
                             btdm.values.extent(0),
                             btdm.values.extent(1),
                             btdm.values.extent(2),
                             vector_length/internal_vector_length),
    X_internal_vector_values((internal_vector_type*)pmv.data(),
                             pmv.extent(0),
                             pmv.extent(1),
                             pmv.extent(2),
                             vector_length/internal_vector_length),
    vector_loop_size(vector_length/internal_vector_length),
    Y_scalar_multivector(),
    Z_scalar_vector(),
    df(damping_factor),
    compute_diff(is_norm_manager_active)
  {}
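// Scatter a packed (SIMD) solution column back into the flat Tpetra
// multivector, applying the damped update y += df*(x_new - y); the squared
// differences are accumulated per team into Z_scalar_vector so the caller
// can form the quantity needed for norm-based termination.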
KOKKOS_INLINE_FUNCTION
void
copyToFlatMultiVector(const member_type &member,
                      const local_ordinal_type partidxbeg, // partidx for v = 0
                      const local_ordinal_type npacks,
                      const local_ordinal_type pri0,
                      const local_ordinal_type v, // index within the vector loop
                      const local_ordinal_type blocksize,
                      const local_ordinal_type num_vectors) const {
  const local_ordinal_type vbeg = v*internal_vector_length;
  if (vbeg < npacks) {
    local_ordinal_type ri0_vals[internal_vector_length] = {};
    local_ordinal_type nrows_vals[internal_vector_length] = {};
    for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
      const local_ordinal_type partidx = partidxbeg+vv;
      ri0_vals[vi] = partptr(partidx);
      nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
    }

    impl_scalar_type z_partial_sum(0);
    if (nrows_vals[0] == 1) {
      const local_ordinal_type j=0, pri=pri0;
      for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
        const local_ordinal_type ri0 = ri0_vals[vi];
        const local_ordinal_type nrows = nrows_vals[vi];
        if (j < nrows) {
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member, blocksize),
             [&](const local_ordinal_type &i) {
              const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
              for (local_ordinal_type col=0;col<num_vectors;++col) {
                impl_scalar_type &y = Y_scalar_multivector(row,col);
                const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
                y += df*yd;

                const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
                z_partial_sum += yd_abs*yd_abs;
              }
            });
        }
      }
    } else {
      Kokkos::parallel_for
        (Kokkos::TeamThreadRange(member, nrows_vals[0]),
         [&](const local_ordinal_type &j) {
          const local_ordinal_type pri = pri0 + j;
          for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
            const local_ordinal_type ri0 = ri0_vals[vi];
            const local_ordinal_type nrows = nrows_vals[vi];
            if (j < nrows) {
              for (local_ordinal_type col=0;col<num_vectors;++col) {
                for (local_ordinal_type i=0;i<blocksize;++i) {
                  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
                  impl_scalar_type &y = Y_scalar_multivector(row,col);
                  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
                  y += df*yd;

                  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
                  z_partial_sum += yd_abs*yd_abs;
                }
              }
            }
          }
        });
    }
    Z_scalar_vector(member.league_rank()) += z_partial_sum;
  }
}
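// Solve phase for a single right-hand side over one part, using the factors
// from the numeric phase. Roughly (with B/C the factored super/sub-diagonal
// coupling blocks):
//   forward:  x_1 := L_1^{-1} x_1;  x_{r+1} := L_{r+1}^{-1} (x_{r+1} - C_r x_r)
//   backward: x_n := U_n^{-1} x_n;  x_{r-1} := U_{r-1}^{-1} (x_{r-1} - B_{r-1} x_r)
// The nrows == 1 branch instead multiplies by the explicitly inverted
// diagonal block (a copy through scratch W plus one Gemv).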
template<typename WWViewType>
KOKKOS_INLINE_FUNCTION
void
solveSingleVector(const member_type &member,
                  const local_ordinal_type &blocksize,
                  const local_ordinal_type &i0,
                  const local_ordinal_type &r0,
                  const local_ordinal_type &nrows,
                  const local_ordinal_type &v,
                  const WWViewType &WW) const {
  typedef SolveTridiagsDefaultModeAndAlgo
    <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
  typedef default_mode_and_algo_type::mode_type default_mode_type;
  typedef default_mode_and_algo_type::single_vector_algo_type default_algo_type;

  // base pointers
  auto A = D_internal_vector_values.data();
  auto X = X_internal_vector_values.data();

  // constants
  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();

  // strides
  const local_ordinal_type astep = D_internal_vector_values.stride_0();
  const local_ordinal_type as0 = D_internal_vector_values.stride_1();
  const local_ordinal_type as1 = D_internal_vector_values.stride_2();
  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
  const local_ordinal_type xs0 = X_internal_vector_values.stride_1();

  // move to the starting point of this part and lane
  A += i0*astep + v;
  X += r0*xstep + v;

  if (nrows > 1) {
    // solve Lx = x
    KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
      (default_mode_type,default_algo_type,
       member,
       KB::Diag::Unit,
       blocksize,blocksize,
       one,
       A, as0, as1,
       X, xs0);

    for (local_ordinal_type tr=1;tr<nrows;++tr) {
      KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member,
         blocksize, blocksize,
         -one,
         A+2*astep, as0, as1,
         X, xs0,
         one,
         X+1*xstep, xs0);
      KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member,
         KB::Diag::Unit,
         blocksize,blocksize,
         one,
         A+3*astep, as0, as1,
         X+1*xstep, xs0);

      A += 3*astep;
      X += 1*xstep;
    }

    // solve Ux = x
    KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
      (default_mode_type,default_algo_type,
       member,
       KB::Diag::NonUnit,
       blocksize, blocksize,
       one,
       A, as0, as1,
       X, xs0);

    for (local_ordinal_type tr=nrows;tr>1;--tr) {
      A -= 3*astep;
      KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member,
         blocksize, blocksize,
         -one,
         A+1*astep, as0, as1,
         X, xs0,
         one,
         X-1*xstep, xs0);
      KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
        (default_mode_type,default_algo_type,
         member,
         KB::Diag::NonUnit,
         blocksize, blocksize,
         one,
         A, as0, as1,
         X-1*xstep, xs0);
      X -= 1*xstep;
    }
  } else {
    // the diagonal block was explicitly inverted in the numeric phase
    const local_ordinal_type ws0 = WW.stride_0();
    auto W = WW.data() + v;
    KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
      (default_mode_type,
       member, blocksize, X, xs0, W, ws0);
    KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
      (default_mode_type,default_algo_type,
       member,
       blocksize, blocksize,
       one,
       A, as0, as1,
       W, ws0,
       zero,
       X, xs0);
  }
}
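// Multi-RHS variant of the same forward/backward substitution, operating on
// blocksize x num_vectors panels with Trsm/Gemm instead of Trsv/Gemv.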
template<typename WWViewType>
KOKKOS_INLINE_FUNCTION
void
solveMultiVector(const member_type &member,
                 const local_ordinal_type &/*blocksize*/,
                 const local_ordinal_type &i0,
                 const local_ordinal_type &r0,
                 const local_ordinal_type &nrows,
                 const local_ordinal_type &v,
                 const WWViewType &WW) const {
  typedef SolveTridiagsDefaultModeAndAlgo
    <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
  typedef default_mode_and_algo_type::mode_type default_mode_type;
  typedef default_mode_and_algo_type::multi_vector_algo_type default_algo_type;

  // constants
  const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
  const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();

  // subview pattern
  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
  auto X2 = X1;

  local_ordinal_type i = i0, r = r0;

  if (nrows > 1) {
    // solve Lx = x
    KB::Trsm<member_type,
             KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
             default_mode_type,default_algo_type>
      ::invoke(member, one, A, X1);
    for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
      A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
      X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
      KB::Gemm<member_type,
               KB::Trans::NoTranspose,KB::Trans::NoTranspose,
               default_mode_type,default_algo_type>
        ::invoke(member, -one, A, X1, one, X2);
      A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
      KB::Trsm<member_type,
               KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, X2);
      X1.assign_data( X2.data() );
    }

    // solve Ux = x
    KB::Trsm<member_type,
             KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
             default_mode_type,default_algo_type>
      ::invoke(member, one, A, X1);
    for (local_ordinal_type tr=nrows;tr>1;--tr) {
      i -= 3;
      A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
      X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
      KB::Gemm<member_type,
               KB::Trans::NoTranspose,KB::Trans::NoTranspose,
               default_mode_type,default_algo_type>
        ::invoke(member, -one, A, X1, one, X2);
      A.assign_data( &D_internal_vector_values(i,0,0,v) );
      KB::Trsm<member_type,
               KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
               default_mode_type,default_algo_type>
        ::invoke(member, one, A, X2);
      X1.assign_data( X2.data() );
    }
  } else {
    // matrix is already inverted
    auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
    KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
      ::invoke(member, X1, W);
    KB::Gemm<member_type,
             KB::Trans::NoTranspose,KB::Trans::NoTranspose,
             default_mode_type,default_algo_type>
      ::invoke(member, one, A, W, zero, X1);
  }
}
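// Compile-time dispatch tags: B is a compile-time blocksize (B == 0 falls
// back to the runtime extent), selected by the blocksize switch in run().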
template<int B> struct SingleVectorTag {};
template<int B> struct MultiVectorTag {};

template<int B>
KOKKOS_INLINE_FUNCTION
void
operator() (const SingleVectorTag<B> &, const member_type &member) const {
  const local_ordinal_type packidx = member.league_rank();
  const local_ordinal_type partidx = packptr(packidx);
  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
  const local_ordinal_type pri0 = part2packrowidx0(partidx);
  const local_ordinal_type i0 = pack_td_ptr(partidx);
  const local_ordinal_type r0 = part2packrowidx0(partidx);
  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
  const local_ordinal_type num_vectors = 1;
  internal_vector_scratch_type_3d_view
    WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
  Kokkos::single(Kokkos::PerTeam(member), [&]() {
    Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
  });
  Kokkos::parallel_for
    (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
      solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
      copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
    });
}

template<int B>
KOKKOS_INLINE_FUNCTION
void
operator() (const MultiVectorTag<B> &, const member_type &member) const {
  const local_ordinal_type packidx = member.league_rank();
  const local_ordinal_type partidx = packptr(packidx);
  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
  const local_ordinal_type pri0 = part2packrowidx0(partidx);
  const local_ordinal_type i0 = pack_td_ptr(partidx);
  const local_ordinal_type r0 = part2packrowidx0(partidx);
  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);

  internal_vector_scratch_type_3d_view
    WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
  Kokkos::single(Kokkos::PerTeam(member), [&]() {
    Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
  });
  Kokkos::parallel_for
    (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
      solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
      copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
    });
}
void run(const impl_scalar_type_2d_view_tpetra &Y,
         const impl_scalar_type_1d_view &Z) {
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SolveTridiags");

  // set vectors
  this->Y_scalar_multivector = Y;
  this->Z_scalar_vector = Z;

  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);

  const local_ordinal_type team_size =
    SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
    recommended_team_size(blocksize, vector_length, internal_vector_length);
  const int per_team_scratch = internal_vector_scratch_type_3d_view
    ::shmem_size(blocksize, num_vectors, vector_loop_size);

#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
  if (num_vectors == 1) {                                               \
    const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> >       \
      policy(packptr.extent(0) - 1, team_size, vector_loop_size);       \
    Kokkos::parallel_for                                                \
      ("SolveTridiags::TeamPolicy::run<SingleVector>",                  \
       policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
  } else {                                                              \
    const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >        \
      policy(packptr.extent(0) - 1, team_size, vector_loop_size);       \
    Kokkos::parallel_for                                                \
      ("SolveTridiags::TeamPolicy::run<MultiVector>",                   \
       policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
  } break
#else
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
  if (num_vectors == 1) {                                               \
    Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> >             \
      policy(packptr.extent(0) - 1, team_size, vector_loop_size);       \
    policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));       \
    Kokkos::parallel_for                                                \
      ("SolveTridiags::TeamPolicy::run<SingleVector>",                  \
       policy, *this);                                                  \
  } else {                                                              \
    Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >              \
      policy(packptr.extent(0) - 1, team_size, vector_loop_size);       \
    policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));       \
    Kokkos::parallel_for                                                \
      ("SolveTridiags::TeamPolicy::run<MultiVector>",                   \
       policy, *this);                                                  \
  } break
#endif
  switch (blocksize) {
  case   3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
  case   5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
  case   7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
  case   9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
  case  10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
  case  11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
  case  16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
  case  17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
  case  18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
  }
#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS

  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
}
};
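// Residual computation y = b - R x, where R = A - D is the part of A
// dropped by the block-tridiagonal extraction (cf. the AmD structure).
// The helper below mirrors SolveTridiagsRecommendedCudaTeamSize: it picks a
// total CUDA team size from the block size and divides by team_size to get
// the vector length of the team policy.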
static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return total_team_size/team_size;
}
template<typename MatrixType>
struct ComputeResidualVector {
public:
  using impl_type = ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;

  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type;
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  static constexpr int vector_length = impl_type::vector_length;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };

  // input and output multivectors
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
  Unmanaged<vector_type_3d_view> y_packed;
  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;

  // AmD information
  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // block crs graph information
  // (the kokkos crs graph on device may use a size_type different from size_t)
  using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,node_device_type>::size_type;
  const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,node_device_type> > A_rowptr;
  const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
  const bool is_dm2cm_active;

  template<typename LocalCrsGraphType>
  ComputeResidualVector(const AmD<MatrixType> &amd,
                        const LocalCrsGraphType &graph,
                        const local_ordinal_type &blocksize_requested_,
                        const PartInterface<MatrixType> &interf,
                        const local_ordinal_type_1d_view &dm2cm_)
    : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
      colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
      tpetra_values(amd.tpetra_values),
      A_rowptr(graph.row_map),
      A_colind(graph.entries),
      blocksize_requested(blocksize_requested_),
      part2packrowidx0(interf.part2packrowidx0),
      part2rowidx0(interf.part2rowidx0),
      rowidx2part(interf.rowidx2part),
      partptr(interf.partptr),
      lclrow(interf.lclrow),
      dm2cm(dm2cm_),
      is_dm2cm_active(dm2cm_.span() > 0)
  {}
inline
void
SerialGemv(const local_ordinal_type &blocksize,
           const impl_scalar_type * const KOKKOS_RESTRICT AA,
           const impl_scalar_type * const KOKKOS_RESTRICT xx,
           /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
  for (local_ordinal_type k0=0;k0<blocksize;++k0) {
    impl_scalar_type val = 0;
    const local_ordinal_type offset = k0*blocksize;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#   pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#   pragma unroll
#endif
    for (local_ordinal_type k1=0;k1<blocksize;++k1)
      val += AA[offset+k1]*xx[k1];
    yy[k0] -= val;
  }
}
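// Note: the Gemv helpers in this struct compute y -= A*x (the minus sign is
// folded into the kernel), which is the update y = b - R x needs after y
// has been initialized to b.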
template<typename bbViewType, typename yyViewType>
KOKKOS_INLINE_FUNCTION
void
VectorCopy(const member_type &member,
           const local_ordinal_type &blocksize,
           const bbViewType &bb,
           const yyViewType &yy) const {
  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
    yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
  });
}

template<typename AAViewType, typename xxViewType, typename yyViewType>
KOKKOS_INLINE_FUNCTION
void
TeamVectorGemv(const member_type &member,
               const local_ordinal_type &blocksize,
               const AAViewType &AA,
               const xxViewType &xx,
               const yyViewType &yy) const {
  Kokkos::parallel_for
    (Kokkos::TeamThreadRange(member, blocksize),
     [&](const local_ordinal_type &k0) {
      impl_scalar_type val = 0;
      Kokkos::parallel_for
        (Kokkos::ThreadVectorRange(member, blocksize),
         [&](const local_ordinal_type &k1) {
          val += AA(k0,k1)*xx(k1);
        });
      Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
    });
}

template<typename AAViewType, typename xxViewType, typename yyViewType>
KOKKOS_INLINE_FUNCTION
void
VectorGemv(const member_type &member,
           const local_ordinal_type &blocksize,
           const AAViewType &AA,
           const xxViewType &xx,
           const yyViewType &yy) const {
  Kokkos::parallel_for
    (Kokkos::ThreadVectorRange(member, blocksize),
     [&](const local_ordinal_type &k0) {
      impl_scalar_type val(0);
      for (local_ordinal_type k1=0;k1<blocksize;++k1) {
        val += AA(k0,k1)*xx(k1);
      }
      Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
    });
}

struct SeqTag {};
KOKKOS_INLINE_FUNCTION
void
operator() (const SeqTag &, const local_ordinal_type& i) const {
  const local_ordinal_type blocksize = blocksize_requested;
  const local_ordinal_type blocksize_square = blocksize*blocksize;

  // constants
  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
  const local_ordinal_type num_vectors = y.extent(1);
  const local_ordinal_type row = i*blocksize;
  for (local_ordinal_type col=0;col<num_vectors;++col) {
    // y := b
    impl_scalar_type *yy = &y(row, col);
    const impl_scalar_type* const bb = &b(row, col);
    memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);

    // y -= Rx
    const size_type A_k0 = A_rowptr[i];
    for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
      const size_type j = A_k0 + colindsub[k];
      const impl_scalar_type* const AA = &tpetra_values(j*blocksize_square);
      const impl_scalar_type* const xx = &x(A_colind[j]*blocksize, col);
      SerialGemv(blocksize,AA,xx,yy);
    }
  }
}
KOKKOS_INLINE_FUNCTION
void
operator() (const SeqTag &, const member_type &member) const {
  // constants
  const local_ordinal_type blocksize = blocksize_requested;
  const local_ordinal_type blocksize_square = blocksize*blocksize;

  const local_ordinal_type lr = member.league_rank();
  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
  const local_ordinal_type num_vectors = y.extent(1);

  // subview pattern
  auto bb = Kokkos::subview(b, block_range, 0);
  auto xx = bb;
  auto yy = Kokkos::subview(y, block_range, 0);
  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

  const local_ordinal_type row = lr*blocksize;
  for (local_ordinal_type col=0;col<num_vectors;++col) {
    // y := b
    yy.assign_data(&y(row, col));
    bb.assign_data(&b(row, col));
    if (member.team_rank() == 0)
      VectorCopy(member, blocksize, bb, yy);
    member.team_barrier();

    // y -= Rx
    const size_type A_k0 = A_rowptr[lr];
    Kokkos::parallel_for
      (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
       [&](const local_ordinal_type &k) {
        const size_type j = A_k0 + colindsub[k];
        A_block.assign_data( &tpetra_values(j*blocksize_square) );
        xx.assign_data( &x(A_colind[j]*blocksize, col) );
        VectorGemv(member, blocksize, A_block, xx, yy);
      });
  }
}
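// AsyncTag: residual for the asynchronous-import path. Column indices
// >= num_local_rows refer to halo rows delivered in x_remote; dm2cm, when
// active, remaps domain-map to column-map ordering for owned columns. The
// result is written directly into the packed layout (y_packed).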
template<int B> struct AsyncTag {};

template<int B>
KOKKOS_INLINE_FUNCTION
void
operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
  const local_ordinal_type blocksize_square = blocksize*blocksize;

  // constants
  const local_ordinal_type partidx = rowidx2part(rowidx);
  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
  const local_ordinal_type v = partidx % vector_length;

  const local_ordinal_type num_vectors = y_packed.extent(2);
  const local_ordinal_type num_local_rows = lclrow.extent(0);

  // temporary buffer for y flat
  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};

  const local_ordinal_type lr = lclrow(rowidx);
  const local_ordinal_type row = lr*blocksize;
  for (local_ordinal_type col=0;col<num_vectors;++col) {
    // y := b
    memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);

    // y -= Rx
    const size_type A_k0 = A_rowptr[lr];
    for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
      const size_type j = A_k0 + colindsub[k];
      const impl_scalar_type* const AA = &tpetra_values(j*blocksize_square);
      const local_ordinal_type A_colind_at_j = A_colind[j];
      if (A_colind_at_j < num_local_rows) {
        const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
        const impl_scalar_type* const xx = &x(loc*blocksize, col);
        SerialGemv(blocksize, AA,xx,yy);
      } else {
        const auto loc = A_colind_at_j - num_local_rows;
        const impl_scalar_type* const xx_remote = &x_remote(loc*blocksize, col);
        SerialGemv(blocksize, AA,xx_remote,yy);
      }
    }
    // move yy to y_packed
    for (local_ordinal_type k=0;k<blocksize;++k)
      y_packed(pri, k, col)[v] = yy[k];
  }
}
template<int B>
KOKKOS_INLINE_FUNCTION
void
operator() (const AsyncTag<B> &, const member_type &member) const {
  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
  const local_ordinal_type blocksize_square = blocksize*blocksize;

  // constants
  const local_ordinal_type rowidx = member.league_rank();
  const local_ordinal_type partidx = rowidx2part(rowidx);
  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
  const local_ordinal_type v = partidx % vector_length;

  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
  const local_ordinal_type num_local_rows = lclrow.extent(0);

  // subview pattern
  auto bb = Kokkos::subview(b, block_range, 0);
  auto xx = Kokkos::subview(x, block_range, 0);
  auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

  const local_ordinal_type lr = lclrow(rowidx);
  const local_ordinal_type row = lr*blocksize;
  for (local_ordinal_type col=0;col<num_vectors;++col) {
    // y := b
    bb.assign_data(&b(row, col));
    yy.assign_data(&y_packed_scalar(pri, 0, col, v));
    if (member.team_rank() == 0)
      VectorCopy(member, blocksize, bb, yy);
    member.team_barrier();

    // y -= Rx
    const size_type A_k0 = A_rowptr[lr];
    Kokkos::parallel_for
      (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
       [&](const local_ordinal_type &k) {
        const size_type j = A_k0 + colindsub[k];
        A_block.assign_data( &tpetra_values(j*blocksize_square) );

        const local_ordinal_type A_colind_at_j = A_colind[j];
        if (A_colind_at_j < num_local_rows) {
          const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
          xx.assign_data( &x(loc*blocksize, col) );
          VectorGemv(member, blocksize, A_block, xx, yy);
        } else {
          const auto loc = A_colind_at_j - num_local_rows;
          xx_remote.assign_data( &x_remote(loc*blocksize, col) );
          VectorGemv(member, blocksize, A_block, xx_remote, yy);
        }
      });
  }
}
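// OverlapTag<P,B> splits the residual so communication can overlap compute:
// the P == 0 pass computes y := b - R_owned x over owned columns while the
// halo exchange is in flight, and the P == 1 pass accumulates
// y -= R_remote x_remote once it completes.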
template <int P, int B> struct OverlapTag {};

template<int P, int B>
KOKKOS_INLINE_FUNCTION
void
operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
  const local_ordinal_type blocksize_square = blocksize*blocksize;

  // constants
  const local_ordinal_type partidx = rowidx2part(rowidx);
  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
  const local_ordinal_type v = partidx % vector_length;

  const local_ordinal_type num_vectors = y_packed.extent(2);
  const local_ordinal_type num_local_rows = lclrow.extent(0);

  // temporary buffer for y flat
  impl_scalar_type yy[max_blocksize] = {};

  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);

  const local_ordinal_type lr = lclrow(rowidx);
  const local_ordinal_type row = lr*blocksize;
  for (local_ordinal_type col=0;col<num_vectors;++col) {
    if (P == 0) {
      // y := b
      memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
    } else {
      // accumulate the remote contribution into a zeroed buffer
      memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
    }

    // y -= Rx
    const size_type A_k0 = A_rowptr[lr];
    for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
      const size_type j = A_k0 + colindsub_used[k];
      const impl_scalar_type* const AA = &tpetra_values(j*blocksize_square);
      const local_ordinal_type A_colind_at_j = A_colind[j];
      if (P == 0) {
        const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
        const impl_scalar_type* const xx = &x(loc*blocksize, col);
        SerialGemv(blocksize,AA,xx,yy);
      } else {
        const auto loc = A_colind_at_j - num_local_rows;
        const impl_scalar_type* const xx_remote = &x_remote(loc*blocksize, col);
        SerialGemv(blocksize,AA,xx_remote,yy);
      }
    }
    // move yy to y_packed
    if (P == 0) {
      for (local_ordinal_type k=0;k<blocksize;++k)
        y_packed(pri, k, col)[v] = yy[k];
    } else {
      for (local_ordinal_type k=0;k<blocksize;++k)
        y_packed(pri, k, col)[v] += yy[k];
    }
  }
}
template<int P, int B>
KOKKOS_INLINE_FUNCTION
void
operator() (const OverlapTag<P,B> &, const member_type &member) const {
  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
  const local_ordinal_type blocksize_square = blocksize*blocksize;

  // constants
  const local_ordinal_type rowidx = member.league_rank();
  const local_ordinal_type partidx = rowidx2part(rowidx);
  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
  const local_ordinal_type v = partidx % vector_length;

  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
  const local_ordinal_type num_local_rows = lclrow.extent(0);

  // subview pattern
  auto bb = Kokkos::subview(b, block_range, 0);
  auto xx = bb;
  auto xx_remote = bb;
  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);

  const local_ordinal_type lr = lclrow(rowidx);
  const local_ordinal_type row = lr*blocksize;
  for (local_ordinal_type col=0;col<num_vectors;++col) {
    yy.assign_data(&y_packed_scalar(pri, 0, col, v));
    if (P == 0) {
      // y := b
      bb.assign_data(&b(row, col));
      if (member.team_rank() == 0)
        VectorCopy(member, blocksize, bb, yy);
      member.team_barrier();
    }

    // y -= Rx
    const size_type A_k0 = A_rowptr[lr];
    Kokkos::parallel_for
      (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
       [&](const local_ordinal_type &k) {
        const size_type j = A_k0 + colindsub_used[k];
        A_block.assign_data( &tpetra_values(j*blocksize_square) );

        const local_ordinal_type A_colind_at_j = A_colind[j];
        if (P == 0) {
          const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
          xx.assign_data( &x(loc*blocksize, col) );
          VectorGemv(member, blocksize, A_block, xx, yy);
        } else {
          const auto loc = A_colind_at_j - num_local_rows;
          xx_remote.assign_data( &x_remote(loc*blocksize, col) );
          VectorGemv(member, blocksize, A_block, xx_remote, yy);
        }
      });
  }
}
// y = b - Rx; seq method
template<typename MultiVectorLocalViewTypeY,
         typename MultiVectorLocalViewTypeB,
         typename MultiVectorLocalViewTypeX>
void run(const MultiVectorLocalViewTypeY &y_,
         const MultiVectorLocalViewTypeB &b_,
         const MultiVectorLocalViewTypeX &x_) {
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");

  y = y_; b = b_; x = x_;
  if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
    const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
    Kokkos::parallel_for
      ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
#endif
  } else {
#if defined(__CUDA_ARCH__)
    TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
    const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
    Kokkos::parallel_for
      ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
#endif
  }
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
}
// y = b - Rx; async method
template<typename MultiVectorLocalViewTypeB,
         typename MultiVectorLocalViewTypeX,
         typename MultiVectorLocalViewTypeX_Remote>
void run(const vector_type_3d_view &y_packed_,
         const MultiVectorLocalViewTypeB &b_,
         const MultiVectorLocalViewTypeX &x_,
         const MultiVectorLocalViewTypeX_Remote &x_remote_) {
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");

  b = b_; x = x_; x_remote = x_remote_;
  if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
    y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
                                               y_packed_.extent(0),
                                               y_packed_.extent(1),
                                               y_packed_.extent(2),
                                               vector_length);
#endif
  } else {
    y_packed = y_packed_;
  }

  if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
    const Kokkos::TeamPolicy<execution_space,AsyncTag<B> >              \
      policy(rowidx2part.extent(0), team_size, vector_size);            \
    Kokkos::parallel_for                                                \
      ("ComputeResidual::TeamPolicy::run<AsyncTag>",                    \
       policy, *this); } break
    switch (blocksize_requested) {
    case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
    case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
    case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
    case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
    case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
    case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
    case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
    case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
    case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
    default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
  } else {
#if defined(__CUDA_ARCH__)
    TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {               \
    const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
    Kokkos::parallel_for                                                \
      ("ComputeResidual::RangePolicy::run<AsyncTag>",                   \
       policy, *this); } break
    switch (blocksize_requested) {
    case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
    case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
    case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
    case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
    case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
    case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
    case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
    case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
    case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
    default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
  }
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
}
// y = b - Rx; overlapped communication/computation method
template<typename MultiVectorLocalViewTypeB,
         typename MultiVectorLocalViewTypeX,
         typename MultiVectorLocalViewTypeX_Remote>
void run(const vector_type_3d_view &y_packed_,
         const MultiVectorLocalViewTypeB &b_,
         const MultiVectorLocalViewTypeX &x_,
         const MultiVectorLocalViewTypeX_Remote &x_remote_,
         const bool compute_owned) {
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");

  b = b_; x = x_; x_remote = x_remote_;
  if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
    y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
                                               y_packed_.extent(0),
                                               y_packed_.extent(1),
                                               y_packed_.extent(2),
                                               vector_length);
#endif
  } else {
    y_packed = y_packed_;
  }

  if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
    if (compute_owned) {                                                \
      const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> >        \
        policy(rowidx2part.extent(0), team_size, vector_size);          \
      Kokkos::parallel_for                                              \
        ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
    } else {                                                            \
      const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> >        \
        policy(rowidx2part.extent(0), team_size, vector_size);          \
      Kokkos::parallel_for                                              \
        ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
    } break
    switch (blocksize_requested) {
    case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
    case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
    case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
    case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
    case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
    case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
    case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
    case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
    case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
    default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
  } else {
#if defined(__CUDA_ARCH__)
    TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
    if (compute_owned) {                                                \
      const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> >       \
        policy(0, rowidx2part.extent(0));                               \
      Kokkos::parallel_for                                              \
        ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
    } else {                                                            \
      const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> >       \
        policy(0, rowidx2part.extent(0));                               \
      Kokkos::parallel_for                                              \
        ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
    } break
    switch (blocksize_requested) {
    case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
    case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
    case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
    case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
    case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
    case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
    case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
    case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
    case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
    default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
    }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
  }
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
}
};
template<typename MatrixType>
void reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
                  /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ReduceVector");

  using impl_type = ImplType<MatrixType>;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
#if 0
  const auto norm2 = KokkosBlas::nrm1(zz);
#else
  impl_scalar_type norm2(0);
  Kokkos::parallel_reduce
    ("ReduceMultiVector::Device",
     Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
     KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
      update += zz(i);
    }, norm2);
#endif
  vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);

  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
}
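// NormManager tracks residual norms across sweeps and hides reduction
// latency: getBuffer()/ireduce() post a (non-blocking, when MPI-3 is
// available) all-reduce of the rank-local squared norm, and checkDone()
// waits for it only on the sweeps where the criterion is actually tested,
// doubling the check interval until the configured upper bound.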
template<typename MatrixType>
struct NormManager {
public:
  using impl_type = ImplType<MatrixType>;
  using magnitude_type = typename impl_type::magnitude_type;

private:
  bool collective_;
  int sweep_step_, sweep_step_upper_bound_;
#ifdef HAVE_IFPACK2_MPI
  MPI_Request mpi_request_;
  MPI_Comm comm_;
#endif
  magnitude_type work_[3];

public:
  NormManager() = default;
  NormManager(const NormManager &b) = default;
  NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
    sweep_step_ = 1;
    sweep_step_upper_bound_ = 1;
    collective_ = comm->getSize() > 1;
    if (collective_) {
#ifdef HAVE_IFPACK2_MPI
      const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
      TEUCHOS_ASSERT( ! mpi_comm.is_null());
      comm_ = *mpi_comm->getRawMpiComm();
#endif
    }
    const magnitude_type zero(0), minus_one(-1);
    work_[0] = zero;
    work_[1] = zero;
    work_[2] = minus_one;
  }

  // Check the norm every sweep_step sweeps.
  void setCheckFrequency(const int sweep_step) {
    TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
    sweep_step_upper_bound_ = sweep_step;
    sweep_step_ = 1;
  }

  // Get the buffer into which to store rank-local squared norms.
  magnitude_type* getBuffer() { return &work_[0]; }

  // Call MPI_Iallreduce to find the global squared norms.
  void ireduce(const int sweep, const bool force = false) {
    if ( ! force && sweep % sweep_step_) return;

    IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");

    work_[1] = work_[0];
    auto send_data = &work_[1];
    auto recv_data = &work_[0];
    if (collective_) {
#ifdef HAVE_IFPACK2_MPI
# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
      MPI_Iallreduce(send_data, recv_data, 1,
                     Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
                     MPI_SUM, comm_, &mpi_request_);
# else
      MPI_Allreduce (send_data, recv_data, 1,
                     Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
                     MPI_SUM, comm_);
# endif
#endif
    }
  }

  // Check if the norm-based termination criterion is met. tol2 is the
  // tolerance squared. Sweep is the sweep index. If not every iteration is
  // being checked, this function immediately returns false. If a check must
  // be done at this iteration, it waits for the reduction triggered by
  // ireduce to complete, then checks the global norm against the tolerance.
  bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
    // early return
    if (sweep <= 0) return false;

    IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");

    TEUCHOS_ASSERT(sweep >= 1);
    if ( ! force && (sweep - 1) % sweep_step_) return false;
    if (collective_) {
#ifdef HAVE_IFPACK2_MPI
# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
      MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
# endif
#endif
    }
    bool r_val = false;
    if (sweep == 1) {
      work_[2] = work_[0];
    } else {
      r_val = (work_[0] < tol2*work_[2]);
    }

    // adjust sweep step
    const auto adjusted_sweep_step = 2*sweep_step_;
    if (adjusted_sweep_step < sweep_step_upper_bound_) {
      sweep_step_ = adjusted_sweep_step;
    } else {
      sweep_step_ = sweep_step_upper_bound_;
    }
    return r_val;
  }

  // After termination has occurred, finalize the norms for use in
  // get_norms{0,final}.
  void finalize () {
    work_[0] = std::sqrt(work_[0]); // after converged
    if (work_[2] >= 0)
      work_[2] = std::sqrt(work_[2]); // first norm
    // if work_[2] is minus one, then norm is not requested.
  }

  // Report norms to the caller.
  const magnitude_type getNorms0 () const { return work_[2]; }
  const magnitude_type getNormsFinal () const { return work_[0]; }
};
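// Top-level damped block-Jacobi driver. Each sweep computes the packed
// residual r = b - R y, solves D z = r with the factored block tridiagonals,
// and applies y += df*(z - y), which is algebraically the Jacobi update
// y := y + df * D^{-1} (b - A y); the W vector collects per-pack squared
// diffs for the optional norm-based termination.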
3691 template<typename MatrixType>
3693 applyInverseJacobi(// importer
                       const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                       const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
                       const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                       const bool overlap_communication_and_computation,
                       const typename ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
                       /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
                       /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
                       /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
                       // local object interface
                       const PartInterface<MatrixType> &interf, // mesh interface
                       const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
                       const AmD<MatrixType> &amd, // R = A - D
                       /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
                       /* */ NormManager<MatrixType> &norm_manager,
                       // preconditioner parameters
                       const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
                       /* */ bool is_y_zero,
                       const int max_num_sweeps,
                       const typename ImplType<MatrixType>::magnitude_type tol,
                       const int check_tol_every) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");
      using impl_type = ImplType<MatrixType>;
      using node_memory_space = typename impl_type::node_memory_space;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type = typename impl_type::size_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;
      using magnitude_type = typename impl_type::magnitude_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using vector_type_1d_view = typename impl_type::vector_type_1d_view;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
      // at most one of the importers (tpetra or async) may be active
      TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
                                  "Neither the Tpetra importer nor the async importer is null; "
                                  "at most one importer may be active.");
      // the maximum number of sweeps must be positive
      TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
                                  "Maximum number of sweeps must be >= 1.");
      const bool is_seq_method_requested = !tpetra_importer.is_null();
      const bool is_async_importer_active = !async_importer.is_null();
      const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
      const magnitude_type tolerance = tol*tol;
      const local_ordinal_type blocksize = btdm.values.extent(1);
      const local_ordinal_type num_vectors = Y.getNumVectors();
      const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;

      const impl_scalar_type zero(0.0);
      TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
                                  "The seq method for applyInverseJacobi, " <<
                                  "which in any case is for developer use only, " <<
                                  "does not support norm-based termination.");
      // resize the workspace if more is required
      const size_type work_span_required = num_blockrows*num_vectors*blocksize;
      if (work.span() < work_span_required)
        work = vector_type_1d_view("vector workspace 1d view", work_span_required);
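      // Illustrative sizing (comment only, made-up numbers): 1000 packed
      // block rows, blocksize 5, and 2 right-hand-side vectors require
      // 1000*2*5 = 10000 vector_type entries; a smaller workspace is
      // reallocated, a larger one is reused as-is.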
      // W holds one scalar per pack (the rank-local difference norms)
      const local_ordinal_type W_size = interf.packptr.extent(0)-1;
      if (local_ordinal_type(W.extent(0)) < W_size)
        W = impl_scalar_type_1d_view("W", W_size);
      typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
      {
        if (is_seq_method_requested) {
          if (Z.getNumVectors() != Y.getNumVectors())
            Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
        } else {
          if (is_async_importer_active) {
            // create comm data buffer and keep it here
            async_importer->createDataBuffer(num_vectors);
            remote_multivector = async_importer->getRemoteMultiVectorLocalView();
          }
        }
      }
      // wrap the workspace with a 3-D view
      vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
      const auto XX = X.template getLocalView<node_memory_space>();
      const auto YY = Y.template getLocalView<node_memory_space>();
      const auto ZZ = Z.template getLocalView<node_memory_space>();
      if (is_y_zero) Kokkos::deep_copy(YY, zero);
      MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
      SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
                                               damping_factor, is_norm_manager_active);
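      // Note (descriptive comment): damping_factor is applied inside
      // solve_tridiags.run below, which computes pmv := inv(D) pmv and folds
      // the damped update into Y; when the norm manager is active it also
      // fills W with the per-pack update differences for the termination test.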
      const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
      ComputeResidualVector<MatrixType>
        compute_residual_vector(amd, A->getCrsGraph().getLocalGraph(), blocksize, interf,
                                is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
      // set how often the norm-based termination criterion is checked
      if (is_norm_manager_active)
        norm_manager.setCheckFrequency(check_tol_every);
      // iterate
      int sweep = 0;
      for (;sweep<max_num_sweeps;++sweep) {
        if (is_y_zero) {
          // pmv := x(lclrow)
          multivector_converter.run(XX);
        } else {
          if (is_seq_method_requested) {
            // SEQ METHOD IS TESTING ONLY
            // y := x - R y
            Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
            compute_residual_vector.run(YY, XX, ZZ);
            // pmv := y(lclrow).
            multivector_converter.run(YY);
          } else {
            // fused y := x - R y and pmv := y(lclrow);
            // the real use case does not overlap computation and communication
            if (overlap_communication_and_computation || !is_async_importer_active) {
              if (is_async_importer_active) async_importer->asyncSendRecv(YY);
              // compute on the locally owned rows while the halo is in flight
              compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
              if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
                if (is_async_importer_active) async_importer->cancel();
                break;
              }
              if (is_async_importer_active) {
                async_importer->syncRecv();
                // finish the rows that need the received halo values
                compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
              }
            } else {
              if (is_async_importer_active)
                async_importer->syncExchange(YY);
              if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
              compute_residual_vector.run(pmv, XX, YY, remote_multivector);
            }
          }
        }
        // pmv := inv(D) pmv.
        solve_tridiags.run(YY, W);
        if (is_norm_manager_active) {
          // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
          reduceVector<MatrixType>(W, norm_manager.getBuffer());
          if (sweep + 1 == max_num_sweeps) {
            // last sweep: force the reduction and the convergence check
            norm_manager.ireduce(sweep, true);
            norm_manager.checkDone(sweep + 1, tolerance, true);
          } else {
            norm_manager.ireduce(sweep);
          }
        }
        is_y_zero = false;
      }
      // sqrt the norms for the caller's use.
      if (is_norm_manager_active) norm_manager.finalize();

      return sweep;
    }
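    // In exact arithmetic, each sweep above realizes the damped block-Jacobi
    // update (a sketch of the math, not additional code; omega denotes
    // damping_factor and R = A - D):
    //
    //   y_{k+1} = (1 - omega) * y_k + omega * D^{-1} (x - R * y_k)
    //
    // with D the packed block-tridiagonal part factored in the numeric phase.
    // For omega = 1 this is plain block-Jacobi: y_{k+1} = D^{-1} (x - R y_k).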
    // Bundle of all objects a container instance keeps alive across setup
    // and apply calls.
    template<typename MatrixType>
    struct ImplObject {
      using impl_type = ImplType<MatrixType>;
      using part_interface_type = PartInterface<MatrixType>;
      using block_tridiags_type = BlockTridiags<MatrixType>;
      using amd_type = AmD<MatrixType>;
      using norm_manager_type = NormManager<MatrixType>;
      using async_import_type = AsyncableImport<MatrixType>;
      // distributed objects
      Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
      Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
      Teuchos::RCP<async_import_type> async_importer;
      bool overlap_communication_and_computation;
      // copy of Y (mutable to penetrate const)
      mutable typename impl_type::tpetra_multivector_type Z;
      mutable typename impl_type::impl_scalar_type_1d_view W;
      // local objects
      part_interface_type part_interface;
      block_tridiags_type block_tridiags; // D
      amd_type a_minus_d; // R = A - D
      mutable typename impl_type::vector_type_1d_view work; // right-hand-side workspace
      mutable norm_manager_type norm_manager;
    };
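    // Sketch of how a container typically wires these pieces together before
    // calling applyInverseJacobi (illustrative only; `partitions` and `tiny`
    // come from the caller, and only one importer path is active at a time):
    //
    //   ImplObject<MatrixType> obj;
    //   obj.A = A;
    //   obj.async_importer = createBlockCrsAsyncImporter<MatrixType>(A);
    //   // (the seq/testing path uses createBlockCrsTpetraImporter instead)
    //   obj.part_interface = createPartInterface<MatrixType>(A, partitions);
    //   obj.block_tridiags = createBlockTridiags<MatrixType>(obj.part_interface);
    //   performSymbolicPhase<MatrixType>(A, obj.part_interface, obj.block_tridiags,
    //                                    obj.a_minus_d,
    //                                    obj.overlap_communication_and_computation);
    //   performNumericPhase<MatrixType>(A, obj.part_interface, obj.block_tridiags, tiny);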
  } // namespace BlockTriDiContainerDetails
} // namespace Ifpack2

#endif // IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP