#ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"

namespace BlockHelperDetails {
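// AmD ("A minus D") holds the sparsity and values of the part of the local
// block CRS matrix that lies outside the block-tridiagonal D handled by the
// line solves; ComputeResidualVector uses it to form y = b - R x.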
template <typename MatrixType>
struct AmD {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;

  // rowptr(i) .. rowptr(i+1) delimits row i's entries in A_colindsub; the
  // _remote variants index the columns owned by other ranks.
  size_type_1d_view rowptr, rowptr_remote;
  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;

  bool is_tpetra_block_crs;

  // When is_tpetra_block_crs, points to A's block values.
  impl_scalar_type_1d_view_tpetra tpetra_values;

  AmD() = default;
  AmD(const AmD &b) = default;
};
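// PartInterface describes how the local rows are grouped into parts (lines),
// how parts are packed into SIMD vector lanes, and the permutations between
// local row ids, part-ordered row indices, and packed row indices.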
template <typename MatrixType>
struct PartInterface {
  using local_ordinal_type = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
  using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;

  PartInterface() = default;
  PartInterface(const PartInterface &b) = default;
  // local row ids of the rows, in part order
  local_ordinal_type_1d_view lclrow;
  // part p spans row indices partptr(p) .. partptr(p+1)
  local_ordinal_type_1d_view partptr;
  local_ordinal_type_2d_view partptr_sub;
  local_ordinal_type_1d_view partptr_schur;
  // ranges of parts that are packed together into one SIMD pack
  local_ordinal_type_1d_view packptr;
  local_ordinal_type_1d_view packptr_sub;
  local_ordinal_type_1d_view packindices_sub;
  local_ordinal_type_2d_view packindices_schur;
  // part2rowidx0(p): first row index of part p;
  // part2packrowidx0(p): first packed row index of part p
  local_ordinal_type_1d_view part2rowidx0;
  local_ordinal_type_1d_view part2rowidx0_sub;
  local_ordinal_type_1d_view part2packrowidx0;
  local_ordinal_type_2d_view part2packrowidx0_sub;
  local_ordinal_type part2packrowidx0_back;
  // part id of each row index
  local_ordinal_type_1d_view rowidx2part;
  local_ordinal_type_1d_view rowidx2part_sub;

  local_ordinal_type max_partsz;
  local_ordinal_type max_subpartsz;
  local_ordinal_type n_subparts_per_part;
  local_ordinal_type nparts;
};
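// Map a block size to a recommended total team size (threads x vector lanes)
// for the residual kernels; the vector length handed to Kokkos is
// total_team_size / team_size.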
static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return total_team_size/team_size;
}
static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
                                                                const int team_size) {
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return total_team_size/team_size;
}
static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return total_team_size/team_size;
}
template<typename T>
static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
                                                             const int team_size) {
  if ( is_cuda<T>::value )
    return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
  if ( is_hip<T>::value )
    return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
  if ( is_sycl<T>::value )
    return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
  return -1;
}
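// Functor that computes the residual for the rows owned by this rank: the
// SeqTag paths write y = b - R x into a plain block multivector, while the
// AsyncTag/OverlapTag paths accumulate into the SIMD-packed y used by the
// block-tridiagonal solve; OverlapTag splits owned (P==0) and remote (P==1)
// columns so the remote part can be applied after communication completes.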
template <typename MatrixType>
struct ComputeResidualVector {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
  // views
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  static constexpr int vector_length = impl_type::vector_length;

  // team policy member type
  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };
  // block multivectors
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
  Unmanaged<vector_type_3d_view> y_packed;
  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;

  // AmD information
  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // block crs graph information
  const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_rowptr;
  const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
  const bool is_dm2cm_active;
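  // Binds the functor to the AmD structure, the block CRS graph of A, and the
  // part interface; dm2cm, when nonempty, maps domain-map local ids to
  // column-map local ids.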
  template<typename LocalCrsGraphType>
  ComputeResidualVector(const AmD<MatrixType> &amd,
                        const LocalCrsGraphType &graph,
                        const local_ordinal_type &blocksize_requested_,
                        const PartInterface<MatrixType> &interf,
                        const local_ordinal_type_1d_view &dm2cm_)
    : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
      colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
      tpetra_values(amd.tpetra_values),
      A_rowptr(graph.row_map),
      A_colind(graph.entries),
      blocksize_requested(blocksize_requested_),
      part2packrowidx0(interf.part2packrowidx0),
      part2rowidx0(interf.part2rowidx0),
      rowidx2part(interf.rowidx2part),
      partptr(interf.partptr),
      lclrow(interf.lclrow),
      dm2cm(dm2cm_),
      is_dm2cm_active(dm2cm_.span() > 0)
  {}
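  // y -= A*x for a single dense block; non-team (one thread) path.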
  KOKKOS_INLINE_FUNCTION
  void
  SerialGemv(const local_ordinal_type &blocksize,
             const impl_scalar_type * const KOKKOS_RESTRICT AA,
             const impl_scalar_type * const KOKKOS_RESTRICT xx,
             impl_scalar_type * KOKKOS_RESTRICT yy) const {
    using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
    for (local_ordinal_type k0=0;k0<blocksize;++k0) {
      impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#   pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#   pragma unroll
#endif
      for (local_ordinal_type k1=0;k1<blocksize;++k1)
        val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
      yy[k0] -= val;
    }
  }
  template<typename bbViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  VectorCopy(const member_type &member,
             const local_ordinal_type &blocksize,
             const bbViewType &bb,
             const yyViewType &yy) const {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
      yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
    });
  }
  template<typename AAViewType, typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  TeamVectorGemv(const member_type &member,
                 const local_ordinal_type &blocksize,
                 const AAViewType &AA,
                 const xxViewType &xx,
                 const yyViewType &yy) const {
    Kokkos::parallel_for
      (Kokkos::TeamThreadRange(member, blocksize),
       [&](const local_ordinal_type &k0) {
        impl_scalar_type val = 0;
        Kokkos::parallel_for
          (Kokkos::ThreadVectorRange(member, blocksize),
           [&](const local_ordinal_type &k1) {
            val += AA(k0,k1)*xx(k1);
          });
        Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
      });
  }
  template<typename AAViewType, typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  VectorGemv(const member_type &member,
             const local_ordinal_type &blocksize,
             const AAViewType &AA,
             const xxViewType &xx,
             const yyViewType &yy) const {
    Kokkos::parallel_for
      (Kokkos::ThreadVectorRange(member, blocksize),
       [&](const local_ordinal_type &k0) {
        impl_scalar_type val(0);
        for (local_ordinal_type k1=0;k1<blocksize;++k1) {
          val += AA(k0,k1)*xx(k1);
        }
        Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
      });
  }
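  // SeqTag: one local row per work item; results go straight into the
  // unpacked multivector y.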
  struct SeqTag {};

  KOKKOS_INLINE_FUNCTION
  void
  operator() (const SeqTag &, const local_ordinal_type& i) const {
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);
    const local_ordinal_type row = i*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      // y := b
      impl_scalar_type *yy = &y(row, col);
      const impl_scalar_type * const bb = &b(row, col);
      memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);

      // y -= Rx
      const size_type A_k0 = A_rowptr[i];
      for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
        const size_type j = A_k0 + colindsub[k];
        const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
        const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
        SerialGemv(blocksize,AA,xx,yy);
      }
    }
  }
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const SeqTag &, const member_type &member) const {
    // constants
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    const local_ordinal_type lr = member.league_rank();
    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);

    // subview pattern
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto yy = Kokkos::subview(y, block_range, 0);
    auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      // y := b
      yy.assign_data(&y(row, col));
      bb.assign_data(&b(row, col));
      if (member.team_rank() == 0)
        VectorCopy(member, blocksize, bb, yy);
      member.team_barrier();

      // y -= Rx
      const size_type A_k0 = A_rowptr[lr];
      Kokkos::parallel_for
        (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
         [&](const local_ordinal_type &k) {
          const size_type j = A_k0 + colindsub[k];
          A_block.assign_data( &tpetra_values(j*blocksize_square) );
          xx.assign_data( &x(A_colind[j]*blocksize, col) );
          VectorGemv(member, blocksize, A_block, xx, yy);
        });
    }
  }
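  // AsyncTag<B>: one packed row per work item; B is the compile-time block
  // size (0 selects the runtime blocksize_requested).  Columns beyond the
  // locally owned rows are read from x_remote.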
  template<int B> struct AsyncTag {};

  template<int B>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const local_ordinal_type num_vectors = y_packed.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // temporary flat buffer for y
    impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      // y := b
      memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);

      // y -= Rx
      const size_type A_k0 = A_rowptr[lr];
      for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
        const size_type j = A_k0 + colindsub[k];
        const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
        const local_ordinal_type A_colind_at_j = A_colind[j];
        if (A_colind_at_j < num_local_rows) {
          const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
          const impl_scalar_type * const xx = &x(loc*blocksize, col);
          SerialGemv(blocksize, AA,xx,yy);
        } else {
          const auto loc = A_colind_at_j - num_local_rows;
          const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
          SerialGemv(blocksize, AA,xx_remote,yy);
        }
      }

      // move yy into y_packed
      for (local_ordinal_type k=0;k<blocksize;++k)
        y_packed(pri, k, col)[v] = yy[k];
    }
  }
  template<int B>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const AsyncTag<B> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
    subview_1D_right_t bb(nullptr, blocksize);
    subview_1D_right_t xx(nullptr, blocksize);
    subview_1D_right_t xx_remote(nullptr, blocksize);
    using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
    subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
    auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      // y := b
      bb.assign_data(&b(row, col));
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      if (member.team_rank() == 0)
        VectorCopy(member, blocksize, bb, yy);
      member.team_barrier();

      // y -= Rx
      const size_type A_k0 = A_rowptr[lr];
      Kokkos::parallel_for
        (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
         [&](const local_ordinal_type &k) {
          const size_type j = A_k0 + colindsub[k];
          A_block.assign_data( &tpetra_values(j*blocksize_square) );

          const local_ordinal_type A_colind_at_j = A_colind[j];
          if (A_colind_at_j < num_local_rows) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            xx.assign_data( &x(loc*blocksize, col) );
            VectorGemv(member, blocksize, A_block, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            xx_remote.assign_data( &x_remote(loc*blocksize, col) );
            VectorGemv(member, blocksize, A_block, xx_remote, yy);
          }
        });
    }
  }
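  // OverlapTag<P,B>: the same update split into two sweeps so communication
  // can be overlapped with compute; P==0 applies the locally owned columns
  // (and initializes y := b), P==1 accumulates the remote-column contribution.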
  template <int P, int B> struct OverlapTag {};
  template<int P, int B>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const local_ordinal_type num_vectors = y_packed.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // temporary flat buffer for y
    impl_scalar_type yy[max_blocksize] = {};

    auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
    auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      if (P == 0) {
        // y := b
        memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
      } else {
        // temporary accumulator := 0
        memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
      }

      // y -= Rx
      const size_type A_k0 = A_rowptr[lr];
      for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
        const size_type j = A_k0 + colindsub_used[k];
        const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
        const local_ordinal_type A_colind_at_j = A_colind[j];
        if (P == 0) {
          const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
          const impl_scalar_type * const xx = &x(loc*blocksize, col);
          SerialGemv(blocksize,AA,xx,yy);
        } else {
          const auto loc = A_colind_at_j - num_local_rows;
          const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
          SerialGemv(blocksize,AA,xx_remote,yy);
        }
      }

      // move yy into y_packed
      if (P == 0) {
        for (local_ordinal_type k=0;k<blocksize;++k)
          y_packed(pri, k, col)[v] = yy[k];
      } else {
        for (local_ordinal_type k=0;k<blocksize;++k)
          y_packed(pri, k, col)[v] += yy[k];
      }
    }
  }
  template<int P, int B>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const OverlapTag<P,B> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
    subview_1D_right_t bb(nullptr, blocksize);
    subview_1D_right_t xx(nullptr, blocksize);
    subview_1D_right_t xx_remote(nullptr, blocksize);
    using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
    subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
    auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
    auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
    auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      if (P == 0) {
        // y := b
        bb.assign_data(&b(row, col));
        if (member.team_rank() == 0)
          VectorCopy(member, blocksize, bb, yy);
        member.team_barrier();
      }

      // y -= Rx
      const size_type A_k0 = A_rowptr[lr];
      Kokkos::parallel_for
        (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
         [&](const local_ordinal_type &k) {
          const size_type j = A_k0 + colindsub_used[k];
          A_block.assign_data( &tpetra_values(j*blocksize_square) );

          const local_ordinal_type A_colind_at_j = A_colind[j];
          if (P == 0) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            xx.assign_data( &x(loc*blocksize, col) );
            VectorGemv(member, blocksize, A_block, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            xx_remote.assign_data( &x_remote(loc*blocksize, col) );
            VectorGemv(member, blocksize, A_block, xx_remote, yy);
          }
        });
    }
  }
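  // Entry points: each run() overload binds the vector views and launches the
  // corresponding kernel with a TeamPolicy on device builds or a RangePolicy
  // otherwise.  This overload computes y = b - R x into an unpacked y.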
  template<typename MultiVectorLocalViewTypeY,
           typename MultiVectorLocalViewTypeB,
           typename MultiVectorLocalViewTypeX>
  void run(const MultiVectorLocalViewTypeY &y_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");

    y = y_; b = b_; x = x_;
    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize = blocksize_requested;
      const local_ordinal_type team_size = 8;
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
      const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
      Kokkos::parallel_for
        ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
    } else {
      const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
      Kokkos::parallel_for
        ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
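  // Packed variant: y_packed_ (SIMD-packed 3-D view) receives b - R x; on
  // device builds the packed view is reinterpreted as a scalar 4-D view so
  // each vector lane can be addressed directly.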
  template<typename MultiVectorLocalViewTypeB,
           typename MultiVectorLocalViewTypeX,
           typename MultiVectorLocalViewTypeX_Remote>
  void run(const vector_type_3d_view &y_packed_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_,
           const MultiVectorLocalViewTypeX_Remote &x_remote_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");

    b = b_; x = x_; x_remote = x_remote_;
    if constexpr (is_device<execution_space>::value) {
      y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
                                                 y_packed_.extent(0),
                                                 y_packed_.extent(1),
                                                 y_packed_.extent(2),
                                                 vector_length);
    } else {
      y_packed = y_packed_;
    }

    if constexpr(is_device<execution_space>::value) {
      const local_ordinal_type blocksize = blocksize_requested;
      const local_ordinal_type team_size = 8;
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
      const Kokkos::TeamPolicy<execution_space,AsyncTag<B> >            \
        policy(rowidx2part.extent(0), team_size, vector_size);          \
      Kokkos::parallel_for                                              \
        ("ComputeResidual::TeamPolicy::run<AsyncTag>",                  \
         policy, *this); } break
      switch (blocksize_requested) {
      case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
      const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
      Kokkos::parallel_for                                              \
        ("ComputeResidual::RangePolicy::run<AsyncTag>",                 \
         policy, *this); } break
      switch (blocksize_requested) {
      case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
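  // Same as above, but compute_owned selects between the owned-column sweep
  // (OverlapTag<0,B>) and the remote-column sweep (OverlapTag<1,B>).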
  template<typename MultiVectorLocalViewTypeB,
           typename MultiVectorLocalViewTypeX,
           typename MultiVectorLocalViewTypeX_Remote>
  void run(const vector_type_3d_view &y_packed_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_,
           const MultiVectorLocalViewTypeX_Remote &x_remote_,
           const bool compute_owned) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");

    b = b_; x = x_; x_remote = x_remote_;
    if constexpr (is_device<execution_space>::value) {
      y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
                                                 y_packed_.extent(0),
                                                 y_packed_.extent(1),
                                                 y_packed_.extent(2),
                                                 vector_length);
    } else {
      y_packed = y_packed_;
    }

    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize = blocksize_requested;
      const local_ordinal_type team_size = 8;
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
      if (compute_owned) {                                              \
        const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> >      \
          policy(rowidx2part.extent(0), team_size, vector_size);        \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
      } else {                                                          \
        const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> >      \
          policy(rowidx2part.extent(0), team_size, vector_size);        \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
      } break
      switch (blocksize_requested) {
      case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
      if (compute_owned) {                                              \
        const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> >     \
          policy(0, rowidx2part.extent(0));                             \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
      } else {                                                          \
        const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> >     \
          policy(0, rowidx2part.extent(0));                             \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
      } break
      switch (blocksize_requested) {
      case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }