10 #ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
11 #define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
13 #include "Ifpack2_BlockHelper.hpp"
17 namespace BlockHelperDetails {
22 template <
typename MatrixType>
25 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
27 using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
29 size_type_1d_view rowptr, rowptr_remote;
36 local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
39 bool is_tpetra_block_crs;
42 impl_scalar_type_1d_view_tpetra tpetra_values;
45 AmD(
const AmD &b) =
default;
48 template<
typename MatrixType>
49 struct PartInterface {
50 using local_ordinal_type =
typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
51 using local_ordinal_type_1d_view =
typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
52 using local_ordinal_type_2d_view =
typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;
54 PartInterface() =
default;
55 PartInterface(
const PartInterface &b) =
default;
75 local_ordinal_type_1d_view lclrow;
77 local_ordinal_type_1d_view partptr;
78 local_ordinal_type_2d_view partptr_sub;
79 local_ordinal_type_1d_view partptr_schur;
82 local_ordinal_type_1d_view packptr;
83 local_ordinal_type_1d_view packptr_sub;
84 local_ordinal_type_1d_view packindices_sub;
85 local_ordinal_type_2d_view packindices_schur;
88 local_ordinal_type_1d_view part2rowidx0;
89 local_ordinal_type_1d_view part2rowidx0_sub;
93 local_ordinal_type_1d_view part2packrowidx0;
94 local_ordinal_type_2d_view part2packrowidx0_sub;
95 local_ordinal_type part2packrowidx0_back;
97 local_ordinal_type_1d_view rowidx2part;
98 local_ordinal_type_1d_view rowidx2part_sub;
106 local_ordinal_type max_partsz;
107 local_ordinal_type max_subpartsz;
108 local_ordinal_type n_subparts_per_part;
109 local_ordinal_type nparts;
115 static inline int ComputeResidualVectorRecommendedCudaVectorSize(
const int blksize,
116 const int team_size) {
117 int total_team_size(0);
118 if (blksize <= 5) total_team_size = 32;
119 else if (blksize <= 9) total_team_size = 32;
120 else if (blksize <= 12) total_team_size = 96;
121 else if (blksize <= 16) total_team_size = 128;
122 else if (blksize <= 20) total_team_size = 160;
123 else total_team_size = 160;
124 return total_team_size/team_size;
127 static inline int ComputeResidualVectorRecommendedHIPVectorSize(
const int blksize,
128 const int team_size) {
129 int total_team_size(0);
130 if (blksize <= 5) total_team_size = 32;
131 else if (blksize <= 9) total_team_size = 32;
132 else if (blksize <= 12) total_team_size = 96;
133 else if (blksize <= 16) total_team_size = 128;
134 else if (blksize <= 20) total_team_size = 160;
135 else total_team_size = 160;
136 return total_team_size/team_size;
139 static inline int ComputeResidualVectorRecommendedSYCLVectorSize(
const int blksize,
140 const int team_size) {
141 int total_team_size(0);
142 if (blksize <= 5) total_team_size = 32;
143 else if (blksize <= 9) total_team_size = 32;
144 else if (blksize <= 12) total_team_size = 96;
145 else if (blksize <= 16) total_team_size = 128;
146 else if (blksize <= 20) total_team_size = 160;
147 else total_team_size = 160;
148 return total_team_size/team_size;
152 static inline int ComputeResidualVectorRecommendedVectorSize(
const int blksize,
153 const int team_size) {
154 if ( is_cuda<T>::value )
155 return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
156 if ( is_hip<T>::value )
157 return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
158 if ( is_sycl<T>::value )
159 return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
163 template<
typename MatrixType>
164 struct ComputeResidualVector {
166 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
168 using execution_space =
typename impl_type::execution_space;
169 using memory_space =
typename impl_type::memory_space;
171 using local_ordinal_type =
typename impl_type::local_ordinal_type;
174 using magnitude_type =
typename impl_type::magnitude_type;
175 using btdm_scalar_type =
typename impl_type::btdm_scalar_type;
176 using btdm_magnitude_type =
typename impl_type::btdm_magnitude_type;
178 using local_ordinal_type_1d_view =
typename impl_type::local_ordinal_type_1d_view;
180 using tpetra_block_access_view_type =
typename impl_type::tpetra_block_access_view_type;
181 using impl_scalar_type_1d_view =
typename impl_type::impl_scalar_type_1d_view;
182 using impl_scalar_type_2d_view_tpetra =
typename impl_type::impl_scalar_type_2d_view_tpetra;
183 using vector_type_3d_view =
typename impl_type::vector_type_3d_view;
184 using btdm_scalar_type_4d_view =
typename impl_type::btdm_scalar_type_4d_view;
185 static constexpr
int vector_length = impl_type::vector_length;
188 using member_type =
typename Kokkos::TeamPolicy<execution_space>::member_type;
191 enum :
int { max_blocksize = 32 };
194 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
195 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
196 ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
197 Unmanaged<impl_scalar_type_2d_view_tpetra> y;
198 Unmanaged<vector_type_3d_view> y_packed;
199 Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
202 const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
203 const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
204 const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
208 const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_block_rowptr;
209 const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_point_rowptr;
210 const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
213 const local_ordinal_type blocksize_requested;
216 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
217 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
218 const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
219 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
220 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
221 const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
222 const bool is_dm2cm_active;
223 const bool hasBlockCrsMatrix;
226 template<
typename LocalCrsGraphType>
227 ComputeResidualVector(
const AmD<MatrixType> &amd,
228 const LocalCrsGraphType &block_graph,
229 const LocalCrsGraphType &point_graph,
230 const local_ordinal_type &blocksize_requested_,
231 const PartInterface<MatrixType> &interf,
232 const local_ordinal_type_1d_view &dm2cm_)
233 : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
234 colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
235 tpetra_values(amd.tpetra_values),
236 A_block_rowptr(block_graph.row_map),
237 A_point_rowptr(point_graph.row_map),
238 A_colind(block_graph.entries),
239 blocksize_requested(blocksize_requested_),
240 part2packrowidx0(interf.part2packrowidx0),
241 part2rowidx0(interf.part2rowidx0),
242 rowidx2part(interf.rowidx2part),
243 partptr(interf.partptr),
244 lclrow(interf.lclrow),
246 is_dm2cm_active(dm2cm_.span() > 0),
247 hasBlockCrsMatrix(A_block_rowptr.extent(0) == A_point_rowptr.extent(0))
252 SerialDot(
const local_ordinal_type &blocksize,
253 const local_ordinal_type &lclRowID,
254 const local_ordinal_type &lclColID,
255 const local_ordinal_type &ii,
256 const ConstUnmanaged<local_ordinal_type_1d_view>colindsub_,
257 const impl_scalar_type *
const KOKKOS_RESTRICT xx,
258 impl_scalar_type * KOKKOS_RESTRICT yy)
const {
259 const size_type Aj_c = colindsub_(lclColID);
260 auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
261 impl_scalar_type val = 0;
262 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
265 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
268 for (local_ordinal_type k1=0;k1<blocksize;++k1)
269 val += tpetra_values(point_row_offset + k1) * xx[k1];
275 SerialGemv(
const local_ordinal_type &blocksize,
276 const impl_scalar_type *
const KOKKOS_RESTRICT AA,
277 const impl_scalar_type *
const KOKKOS_RESTRICT xx,
278 impl_scalar_type * KOKKOS_RESTRICT yy)
const {
279 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
280 for (local_ordinal_type k0=0;k0<blocksize;++k0) {
281 impl_scalar_type val = 0;
282 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
285 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
288 for (local_ordinal_type k1=0;k1<blocksize;++k1)
289 val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
294 template<
typename bbViewType,
typename yyViewType>
295 KOKKOS_INLINE_FUNCTION
297 VectorCopy(
const member_type &member,
298 const local_ordinal_type &blocksize,
299 const bbViewType &bb,
300 const yyViewType &yy)
const {
301 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](
const local_ordinal_type &k0) {
302 yy(k0) =
static_cast<typename yyViewType::const_value_type
>(bb(k0));
306 template<
typename AAViewType,
typename xxViewType,
typename yyViewType>
307 KOKKOS_INLINE_FUNCTION
309 TeamVectorGemv(
const member_type &member,
310 const local_ordinal_type &blocksize,
311 const AAViewType &AA,
312 const xxViewType &xx,
313 const yyViewType &yy)
const {
315 (Kokkos::TeamThreadRange(member, blocksize),
316 [&](
const local_ordinal_type &k0) {
317 impl_scalar_type val = 0;
319 (Kokkos::ThreadVectorRange(member, blocksize),
320 [&](
const local_ordinal_type &k1) {
321 val += AA(k0,k1)*xx(k1);
323 Kokkos::atomic_fetch_add(&yy(k0),
typename yyViewType::const_value_type(-val));
327 template<
typename xxViewType,
typename yyViewType>
328 KOKKOS_INLINE_FUNCTION
330 VectorDot(
const member_type &member,
331 const local_ordinal_type &blocksize,
332 const local_ordinal_type &lclRowID,
333 const local_ordinal_type &lclColID,
334 const local_ordinal_type &ii,
335 const ConstUnmanaged<local_ordinal_type_1d_view>colindsub_,
336 const xxViewType &xx,
337 const yyViewType &yy)
const {
338 const size_type Aj_c = colindsub_(lclColID);
339 auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
340 impl_scalar_type val = 0;
342 (Kokkos::ThreadVectorRange(member, blocksize),
343 [&](
const local_ordinal_type &k1) {
344 val += tpetra_values(point_row_offset + k1) *xx(k1);
346 Kokkos::atomic_fetch_add(&yy(ii),
typename yyViewType::const_value_type(-val));
349 template<
typename AAViewType,
typename xxViewType,
typename yyViewType>
350 KOKKOS_INLINE_FUNCTION
352 VectorGemv(
const member_type &member,
353 const local_ordinal_type &blocksize,
354 const AAViewType &AA,
355 const xxViewType &xx,
356 const yyViewType &yy)
const {
358 (Kokkos::ThreadVectorRange(member, blocksize),
359 [&](
const local_ordinal_type &k0) {
360 impl_scalar_type val(0);
361 for (local_ordinal_type k1=0;k1<blocksize;++k1) {
362 val += AA(k0,k1)*xx(k1);
364 Kokkos::atomic_fetch_add(&yy(k0),
typename yyViewType::const_value_type(-val));
390 KOKKOS_INLINE_FUNCTION
392 operator() (
const SeqTag &,
const local_ordinal_type& i)
const {
393 const local_ordinal_type blocksize = blocksize_requested;
394 const local_ordinal_type blocksize_square = blocksize*blocksize;
397 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
398 const local_ordinal_type num_vectors = y.extent(1);
399 const local_ordinal_type row = i*blocksize;
400 for (local_ordinal_type col=0;col<num_vectors;++col) {
402 impl_scalar_type *yy = &y(row, col);
403 const impl_scalar_type *
const bb = &b(row, col);
404 memcpy(yy, bb,
sizeof(impl_scalar_type)*blocksize);
407 const size_type A_k0 = A_block_rowptr[i];
408 for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
409 const size_type j = A_k0 + colindsub[k];
410 const impl_scalar_type *
const xx = &x(A_colind[j]*blocksize, col);
411 if (hasBlockCrsMatrix) {
412 const impl_scalar_type *
const AA = &tpetra_values(j*blocksize_square);
413 SerialGemv(blocksize,AA,xx,yy);
416 for (local_ordinal_type k0=0;k0<blocksize;++k0)
417 SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
423 KOKKOS_INLINE_FUNCTION
425 operator() (
const SeqTag &,
const member_type &member)
const {
428 const local_ordinal_type blocksize = blocksize_requested;
429 const local_ordinal_type blocksize_square = blocksize*blocksize;
431 const local_ordinal_type lr = member.league_rank();
432 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
433 const local_ordinal_type num_vectors = y.extent(1);
436 auto bb = Kokkos::subview(b, block_range, 0);
438 auto yy = Kokkos::subview(y, block_range, 0);
439 auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
441 const local_ordinal_type row = lr*blocksize;
442 for (local_ordinal_type col=0;col<num_vectors;++col) {
444 yy.assign_data(&y(row, col));
445 bb.assign_data(&b(row, col));
446 if (member.team_rank() == 0)
447 VectorCopy(member, blocksize, bb, yy);
448 member.team_barrier();
451 const size_type A_k0 = A_block_rowptr[lr];
453 if(hasBlockCrsMatrix) {
455 (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
456 [&](
const local_ordinal_type &k) {
457 const size_type j = A_k0 + colindsub[k];
458 xx.assign_data( &x(A_colind[j]*blocksize, col) );
459 A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
460 VectorGemv(member, blocksize, A_block_cst, xx, yy);
465 (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
466 [&](
const local_ordinal_type &k) {
468 const size_type j = A_k0 + colindsub[k];
469 xx.assign_data( &x(A_colind[j]*blocksize, col) );
471 for (local_ordinal_type k0=0;k0<blocksize;++k0)
472 VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
483 KOKKOS_INLINE_FUNCTION
485 operator() (
const AsyncTag<B> &,
const local_ordinal_type &rowidx)
const {
486 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
487 const local_ordinal_type blocksize_square = blocksize*blocksize;
490 const local_ordinal_type partidx = rowidx2part(rowidx);
491 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
492 const local_ordinal_type v = partidx % vector_length;
494 const local_ordinal_type num_vectors = y_packed.extent(2);
495 const local_ordinal_type num_local_rows = lclrow.extent(0);
498 impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
500 const local_ordinal_type lr = lclrow(rowidx);
501 const local_ordinal_type row = lr*blocksize;
502 for (local_ordinal_type col=0;col<num_vectors;++col) {
504 memcpy(yy, &b(row, col),
sizeof(impl_scalar_type)*blocksize);
507 const size_type A_k0 = A_block_rowptr[lr];
508 for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
509 const size_type j = A_k0 + colindsub[k];
510 const local_ordinal_type A_colind_at_j = A_colind[j];
511 if (hasBlockCrsMatrix) {
512 const impl_scalar_type *
const AA = &tpetra_values(j*blocksize_square);
514 if (A_colind_at_j < num_local_rows) {
515 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
516 const impl_scalar_type *
const xx = &x(loc*blocksize, col);
517 SerialGemv(blocksize, AA,xx,yy);
519 const auto loc = A_colind_at_j - num_local_rows;
520 const impl_scalar_type *
const xx_remote = &x_remote(loc*blocksize, col);
521 SerialGemv(blocksize, AA,xx_remote,yy);
525 if (A_colind_at_j < num_local_rows) {
526 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
527 const impl_scalar_type *
const xx = &x(loc*blocksize, col);
528 for (local_ordinal_type k0=0;k0<blocksize;++k0)
529 SerialDot(blocksize, lr, k, k0, colindsub, xx, yy);
531 const auto loc = A_colind_at_j - num_local_rows;
532 const impl_scalar_type *
const xx_remote = &x_remote(loc*blocksize, col);
533 for (local_ordinal_type k0=0;k0<blocksize;++k0)
534 SerialDot(blocksize, lr, k, k0, colindsub, xx_remote, yy);
539 for (local_ordinal_type k=0;k<blocksize;++k)
540 y_packed(pri, k, col)[v] = yy[k];
545 KOKKOS_INLINE_FUNCTION
547 operator() (
const AsyncTag<B> &,
const member_type &member)
const {
548 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
549 const local_ordinal_type blocksize_square = blocksize*blocksize;
552 const local_ordinal_type rowidx = member.league_rank();
553 const local_ordinal_type partidx = rowidx2part(rowidx);
554 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
555 const local_ordinal_type v = partidx % vector_length;
557 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
558 const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
559 const local_ordinal_type num_local_rows = lclrow.extent(0);
562 using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
563 subview_1D_right_t bb(
nullptr, blocksize);
564 subview_1D_right_t xx(
nullptr, blocksize);
565 subview_1D_right_t xx_remote(
nullptr, blocksize);
566 using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
567 subview_1D_stride_t yy(
nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
568 auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
570 const local_ordinal_type lr = lclrow(rowidx);
571 const local_ordinal_type row = lr*blocksize;
572 for (local_ordinal_type col=0;col<num_vectors;++col) {
574 bb.assign_data(&b(row, col));
575 yy.assign_data(&y_packed_scalar(pri, 0, col, v));
576 if (member.team_rank() == 0)
577 VectorCopy(member, blocksize, bb, yy);
578 member.team_barrier();
581 const size_type A_k0 = A_block_rowptr[lr];
582 if(hasBlockCrsMatrix) {
584 (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
585 [&](
const local_ordinal_type &k) {
586 const size_type j = A_k0 + colindsub[k];
587 A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
589 const local_ordinal_type A_colind_at_j = A_colind[j];
590 if (A_colind_at_j < num_local_rows) {
591 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
592 xx.assign_data( &x(loc*blocksize, col) );
593 VectorGemv(member, blocksize, A_block_cst, xx, yy);
595 const auto loc = A_colind_at_j - num_local_rows;
596 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
597 VectorGemv(member, blocksize, A_block_cst, xx_remote, yy);
603 (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
604 [&](
const local_ordinal_type &k) {
605 const size_type j = A_k0 + colindsub[k];
607 const local_ordinal_type A_colind_at_j = A_colind[j];
608 if (A_colind_at_j < num_local_rows) {
609 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
610 xx.assign_data( &x(loc*blocksize, col) );
611 for (local_ordinal_type k0=0;k0<blocksize;++k0)
612 VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
614 const auto loc = A_colind_at_j - num_local_rows;
615 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
616 for (local_ordinal_type k0=0;k0<blocksize;++k0)
617 VectorDot(member, blocksize, lr, k, k0, colindsub, xx_remote, yy);
624 template <
int P,
int B>
struct OverlapTag {};
626 template<
int P,
int B>
628 KOKKOS_INLINE_FUNCTION
630 operator() (
const OverlapTag<P,B> &,
const local_ordinal_type& rowidx)
const {
631 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
632 const local_ordinal_type blocksize_square = blocksize*blocksize;
635 const local_ordinal_type partidx = rowidx2part(rowidx);
636 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
637 const local_ordinal_type v = partidx % vector_length;
639 const local_ordinal_type num_vectors = y_packed.extent(2);
640 const local_ordinal_type num_local_rows = lclrow.extent(0);
643 impl_scalar_type yy[max_blocksize] = {};
645 auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
646 auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
648 const local_ordinal_type lr = lclrow(rowidx);
649 const local_ordinal_type row = lr*blocksize;
650 for (local_ordinal_type col=0;col<num_vectors;++col) {
653 memcpy(yy, &b(row, col),
sizeof(impl_scalar_type)*blocksize);
656 memset((
void*) yy, 0,
sizeof(impl_scalar_type)*blocksize);
660 const size_type A_k0 = A_block_rowptr[lr];
661 for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
662 const size_type j = A_k0 + colindsub_used[k];
663 const local_ordinal_type A_colind_at_j = A_colind[j];
664 if (hasBlockCrsMatrix) {
665 const impl_scalar_type *
const AA = &tpetra_values(j*blocksize_square);
667 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
668 const impl_scalar_type *
const xx = &x(loc*blocksize, col);
669 SerialGemv(blocksize,AA,xx,yy);
671 const auto loc = A_colind_at_j - num_local_rows;
672 const impl_scalar_type *
const xx_remote = &x_remote(loc*blocksize, col);
673 SerialGemv(blocksize,AA,xx_remote,yy);
678 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
679 const impl_scalar_type *
const xx = &x(loc*blocksize, col);
680 for (local_ordinal_type k0=0;k0<blocksize;++k0)
681 SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
683 const auto loc = A_colind_at_j - num_local_rows;
684 const impl_scalar_type *
const xx_remote = &x_remote(loc*blocksize, col);
685 for (local_ordinal_type k0=0;k0<blocksize;++k0)
686 SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
692 for (local_ordinal_type k=0;k<blocksize;++k)
693 y_packed(pri, k, col)[v] = yy[k];
695 for (local_ordinal_type k=0;k<blocksize;++k)
696 y_packed(pri, k, col)[v] += yy[k];
701 template<
int P,
int B>
702 KOKKOS_INLINE_FUNCTION
704 operator() (
const OverlapTag<P,B> &,
const member_type &member)
const {
705 const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
706 const local_ordinal_type blocksize_square = blocksize*blocksize;
709 const local_ordinal_type rowidx = member.league_rank();
710 const local_ordinal_type partidx = rowidx2part(rowidx);
711 const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
712 const local_ordinal_type v = partidx % vector_length;
714 const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
715 const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
716 const local_ordinal_type num_local_rows = lclrow.extent(0);
719 using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
720 subview_1D_right_t bb(
nullptr, blocksize);
721 subview_1D_right_t xx(
nullptr, blocksize);
722 subview_1D_right_t xx_remote(
nullptr, blocksize);
723 using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
724 subview_1D_stride_t yy(
nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
725 auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
726 auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
727 auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
729 const local_ordinal_type lr = lclrow(rowidx);
730 const local_ordinal_type row = lr*blocksize;
731 for (local_ordinal_type col=0;col<num_vectors;++col) {
732 yy.assign_data(&y_packed_scalar(pri, 0, col, v));
735 bb.assign_data(&b(row, col));
736 if (member.team_rank() == 0)
737 VectorCopy(member, blocksize, bb, yy);
738 member.team_barrier();
742 const size_type A_k0 = A_block_rowptr[lr];
743 if(hasBlockCrsMatrix) {
745 (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
746 [&](
const local_ordinal_type &k) {
747 const size_type j = A_k0 + colindsub_used[k];
748 A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
750 const local_ordinal_type A_colind_at_j = A_colind[j];
752 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
753 xx.assign_data( &x(loc*blocksize, col) );
754 VectorGemv(member, blocksize, A_block_cst, xx, yy);
756 const auto loc = A_colind_at_j - num_local_rows;
757 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
758 VectorGemv(member, blocksize, A_block_cst, xx_remote, yy);
764 (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
765 [&](
const local_ordinal_type &k) {
766 const size_type j = A_k0 + colindsub_used[k];
768 const local_ordinal_type A_colind_at_j = A_colind[j];
770 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
771 xx.assign_data( &x(loc*blocksize, col) );
772 for (local_ordinal_type k0=0;k0<blocksize;++k0)
773 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
775 const auto loc = A_colind_at_j - num_local_rows;
776 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
777 for (local_ordinal_type k0=0;k0<blocksize;++k0)
778 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
786 template<
typename MultiVectorLocalViewTypeY,
787 typename MultiVectorLocalViewTypeB,
788 typename MultiVectorLocalViewTypeX>
789 void run(
const MultiVectorLocalViewTypeY &y_,
790 const MultiVectorLocalViewTypeB &b_,
791 const MultiVectorLocalViewTypeX &x_) {
792 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
793 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
"BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);
795 y = y_; b = b_; x = x_;
796 if constexpr (is_device<execution_space>::value) {
797 const local_ordinal_type blocksize = blocksize_requested;
798 const local_ordinal_type team_size = 8;
799 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
800 const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
802 (
"ComputeResidual::TeamPolicy::run<SeqTag>", policy, *
this);
804 const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
806 (
"ComputeResidual::RangePolicy::run<SeqTag>", policy, *
this);
808 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
809 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
813 template<
typename MultiVectorLocalViewTypeB,
814 typename MultiVectorLocalViewTypeX,
815 typename MultiVectorLocalViewTypeX_Remote>
816 void run(
const vector_type_3d_view &y_packed_,
817 const MultiVectorLocalViewTypeB &b_,
818 const MultiVectorLocalViewTypeX &x_,
819 const MultiVectorLocalViewTypeX_Remote &x_remote_) {
820 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
821 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
"BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);
823 b = b_; x = x_; x_remote = x_remote_;
824 if constexpr (is_device<execution_space>::value) {
825 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
831 y_packed = y_packed_;
834 if constexpr(is_device<execution_space>::value) {
835 const local_ordinal_type blocksize = blocksize_requested;
836 const local_ordinal_type team_size = 8;
837 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
842 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
843 const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
844 policy(rowidx2part.extent(0), team_size, vector_size); \
845 Kokkos::parallel_for \
846 ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
847 policy, *this); } break
848 switch (blocksize_requested) {
849 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
850 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
851 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
852 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
853 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
854 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
855 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
856 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
857 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
858 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
860 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
862 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
863 const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
864 Kokkos::parallel_for \
865 ("ComputeResidual::RangePolicy::run<AsyncTag>", \
866 policy, *this); } break
867 switch (blocksize_requested) {
868 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
869 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
870 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
871 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
872 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
873 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
874 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
875 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
876 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
877 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
879 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
881 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
882 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
886 template<
typename MultiVectorLocalViewTypeB,
887 typename MultiVectorLocalViewTypeX,
888 typename MultiVectorLocalViewTypeX_Remote>
889 void run(
const vector_type_3d_view &y_packed_,
890 const MultiVectorLocalViewTypeB &b_,
891 const MultiVectorLocalViewTypeX &x_,
892 const MultiVectorLocalViewTypeX_Remote &x_remote_,
893 const bool compute_owned) {
894 IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
895 IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
"BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);
897 b = b_; x = x_; x_remote = x_remote_;
898 if constexpr (is_device<execution_space>::value) {
899 y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
905 y_packed = y_packed_;
908 if constexpr (is_device<execution_space>::value) {
909 const local_ordinal_type blocksize = blocksize_requested;
910 const local_ordinal_type team_size = 8;
911 const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
916 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
917 if (compute_owned) { \
918 const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
919 policy(rowidx2part.extent(0), team_size, vector_size); \
920 Kokkos::parallel_for \
921 ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
923 const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
924 policy(rowidx2part.extent(0), team_size, vector_size); \
925 Kokkos::parallel_for \
926 ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
928 switch (blocksize_requested) {
929 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
930 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
931 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
932 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
933 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
934 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
935 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
936 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
937 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
938 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
940 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
942 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
943 if (compute_owned) { \
944 const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
945 policy(0, rowidx2part.extent(0)); \
946 Kokkos::parallel_for \
947 ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
949 const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
950 policy(0, rowidx2part.extent(0)); \
951 Kokkos::parallel_for \
952 ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
955 switch (blocksize_requested) {
956 case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
957 case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
958 case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
959 case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
960 case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
961 case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
962 case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
963 case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
964 case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
965 default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
967 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
969 IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
970 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
node_type::device_type node_device_type
Definition: Ifpack2_BlockHelper.hpp:276
size_t size_type
Definition: Ifpack2_BlockHelper.hpp:253
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition: Ifpack2_BlockHelper.hpp:262
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition: Ifpack2_BlockHelper.hpp:320
Definition: Ifpack2_BlockHelper.hpp:249
Definition: Ifpack2_BlockComputeResidualVector.hpp:23