#ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"

namespace BlockHelperDetails {
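
// Holds the off-block-diagonal part of the matrix ("A minus D") used by the
// residual kernels: row pointers and column sub-indices for owned and remote
// entries, precomputed A/x offsets, and a raw view of the matrix values.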
template <typename MatrixType>
struct AmD {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;
  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;

  // Row pointers into A_colindsub for owned and remote (nonowned) entries.
  size_type_1d_view rowptr, rowptr_remote;
  // For each entry, the position within the row of A of the block to use.
  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
  // Precomputed offsets into the matrix values and x for owned entries.
  i64_3d_view A_x_offsets;
  // Precomputed offsets for remote entries.
  i64_3d_view A_x_offsets_remote;
  bool is_tpetra_block_crs;
  // Raw view of the matrix values.
  impl_scalar_type_1d_view_tpetra tpetra_values;

  AmD(const AmD &b) = default;
};
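
// Describes how the local rows are grouped into parts and SIMD packs; the
// member views map between row indices, part indices, and packed row indices.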
template <typename MatrixType>
struct PartInterface {
  using local_ordinal_type = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
  using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;

  PartInterface() = default;
  PartInterface(const PartInterface &b) = default;

  local_ordinal_type_1d_view lclrow;
  local_ordinal_type_1d_view partptr;
  local_ordinal_type_2d_view partptr_sub;
  local_ordinal_type_1d_view partptr_schur;
  local_ordinal_type_1d_view packptr;
  local_ordinal_type_1d_view packptr_sub;
  local_ordinal_type_1d_view packindices_sub;
  local_ordinal_type_2d_view packindices_schur;
  local_ordinal_type_1d_view part2rowidx0;
  local_ordinal_type_1d_view part2rowidx0_sub;
  local_ordinal_type_1d_view part2packrowidx0;
  local_ordinal_type_2d_view part2packrowidx0_sub;
  local_ordinal_type part2packrowidx0_back;
  local_ordinal_type_1d_view rowidx2part;
  local_ordinal_type_1d_view rowidx2part_sub;
  local_ordinal_type max_partsz;
  local_ordinal_type max_subpartsz;
  local_ordinal_type n_subparts_per_part;
  local_ordinal_type nparts;
};
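
// Precomputes, for every local row, the 64-bit offsets of (a) each block of A
// in tpetra_values and (b) the corresponding block column of x, so the
// residual kernels can index values and x directly instead of walking the
// graph. When ownedRemoteSeparate is true, owned and remote entries go into
// separate views (A_x_offsets / A_x_offsets_remote); otherwise both are
// interleaved in A_x_offsets. Unused trailing slots are tagged with
// Kokkos::ArithTraits<int64_t>::min().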
template <typename MatrixType>
void precompute_A_x_offsets(
    AmD<MatrixType> &amd,
    const PartInterface<MatrixType> &interf,
    const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_crs_graph_type> &g,
    const typename ImplType<MatrixType>::local_ordinal_type_1d_view &dm2cm,
    int blocksize,
    bool ownedRemoteSeparate) {
  using impl_type = ImplType<MatrixType>;
  using i64_3d_view = typename impl_type::i64_3d_view;
  using size_type = typename impl_type::size_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using execution_space = typename impl_type::execution_space;
  auto local_graph = g->getLocalGraphDevice();
  const auto A_block_rowptr = local_graph.row_map;
  const auto A_colind = local_graph.entries;
  local_ordinal_type numLocalRows = interf.rowidx2part.extent(0);
  int blocksize_square = blocksize * blocksize;

  auto lclrow = interf.lclrow;
  auto A_colindsub = amd.A_colindsub;
  auto A_colindsub_remote = amd.A_colindsub_remote;
  auto rowptr = amd.rowptr;
  auto rowptr_remote = amd.rowptr_remote;
  bool is_dm2cm_active = dm2cm.extent(0);
  if (ownedRemoteSeparate) {
    // Owned and remote columns are handled separately: find the widest row in
    // each set, then fill the two offset views.
    local_ordinal_type maxOwnedEntriesPerRow = 0;
    local_ordinal_type maxNonownedEntriesPerRow = 0;
    Kokkos::parallel_reduce(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
      KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type &lmaxOwned, local_ordinal_type &lmaxNonowned) {
        const local_ordinal_type lr = lclrow(i);
        local_ordinal_type rowNumOwned = rowptr(lr+1) - rowptr(lr);
        if (rowNumOwned > lmaxOwned)
          lmaxOwned = rowNumOwned;
        // rowptr_remote is not allocated for single-rank problems.
        if (rowptr_remote.extent(0)) {
          local_ordinal_type rowNumNonowned = rowptr_remote(lr+1) - rowptr_remote(lr);
          if (rowNumNonowned > lmaxNonowned)
            lmaxNonowned = rowNumNonowned;
        }
      }, Kokkos::Max<local_ordinal_type>(maxOwnedEntriesPerRow), Kokkos::Max<local_ordinal_type>(maxNonownedEntriesPerRow));

    // Dimension 0 is the row, dimension 1 selects A offset vs. x offset,
    // dimension 2 is the entry within the row.
    amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxOwnedEntriesPerRow);
    amd.A_x_offsets_remote = i64_3d_view("amd.A_x_offsets_remote", numLocalRows, 2, maxNonownedEntriesPerRow);
    auto A_x_offsets = amd.A_x_offsets;
    auto A_x_offsets_remote = amd.A_x_offsets_remote;

    Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
      KOKKOS_LAMBDA(local_ordinal_type i) {
        const local_ordinal_type lr = lclrow(i);
        const size_type A_k0 = A_block_rowptr(lr);
        // Owned entries.
        size_type rowBegin = rowptr(lr);
        local_ordinal_type rowNumOwned = rowptr(lr+1) - rowBegin;
        for (local_ordinal_type entry = 0; entry < maxOwnedEntriesPerRow; entry++) {
          if (entry < rowNumOwned) {
            const size_type j = A_k0 + A_colindsub(rowBegin + entry);
            const local_ordinal_type A_colind_at_j = A_colind(j);
            const local_ordinal_type loc = is_dm2cm_active ? dm2cm(A_colind_at_j) : A_colind_at_j;
            A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
            A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
          } else {
            A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
            A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
          }
        }
        // Remote (nonowned) entries.
        if (rowptr_remote.extent(0)) {
          rowBegin = rowptr_remote(lr);
          local_ordinal_type rowNumNonowned = rowptr_remote(lr+1) - rowBegin;
          for (local_ordinal_type entry = 0; entry < maxNonownedEntriesPerRow; entry++) {
            if (entry < rowNumNonowned) {
              const size_type j = A_k0 + A_colindsub_remote(rowBegin + entry);
              const local_ordinal_type A_colind_at_j = A_colind(j);
              const local_ordinal_type loc = A_colind_at_j - numLocalRows;
              A_x_offsets_remote(i, 0, entry) = int64_t(j) * blocksize_square;
              A_x_offsets_remote(i, 1, entry) = int64_t(loc) * blocksize;
            } else {
              A_x_offsets_remote(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
              A_x_offsets_remote(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
            }
          }
        }
      });
  } else {
    // Owned and remote entries are interleaved in a single view.
    local_ordinal_type maxEntriesPerRow = 0;
    Kokkos::parallel_reduce(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
      KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type &lmax) {
        const local_ordinal_type lr = lclrow(i);
        local_ordinal_type rowNum = rowptr(lr+1) - rowptr(lr);
        if (rowNum > lmax)
          lmax = rowNum;
      }, Kokkos::Max<local_ordinal_type>(maxEntriesPerRow));
    amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxEntriesPerRow);
    auto A_x_offsets = amd.A_x_offsets;

    Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
      KOKKOS_LAMBDA(local_ordinal_type i) {
        const local_ordinal_type lr = lclrow(i);
        const size_type A_k0 = A_block_rowptr(lr);
        size_type rowBegin = rowptr(lr);
        local_ordinal_type rowOwned = rowptr(lr+1) - rowBegin;
        for (local_ordinal_type entry = 0; entry < maxEntriesPerRow; entry++) {
          if (entry < rowOwned) {
            const size_type j = A_k0 + A_colindsub(rowBegin + entry);
            A_x_offsets(i, 0, entry) = j * blocksize_square;
            const local_ordinal_type A_colind_at_j = A_colind(j);
            if (A_colind_at_j < numLocalRows) {
              // Owned column: map to the column-map ordering if dm2cm is given.
              const local_ordinal_type loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
              A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
            } else {
              // Remote column: keep the raw column index; the kernel
              // distinguishes owned from remote by the cutoff.
              A_x_offsets(i, 1, entry) = int64_t(A_colind_at_j) * blocksize;
            }
          } else {
            A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
            A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
          }
        }
      });
  }
}
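
// Heuristics for the Kokkos vector (inner) length on GPU backends, chosen per
// block size; the recommended vector size is the total team size divided by
// the requested team size.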
static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return total_team_size/team_size;
}

static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
                                                                const int team_size) {
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return total_team_size/team_size;
}

static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if      (blksize <=  5) total_team_size =  32;
  else if (blksize <=  9) total_team_size =  32;
  else if (blksize <= 12) total_team_size =  96;
  else if (blksize <= 16) total_team_size = 128;
  else if (blksize <= 20) total_team_size = 160;
  else                    total_team_size = 160;
  return total_team_size/team_size;
}

template <typename T>
static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
                                                             const int team_size) {
  if ( is_cuda<T>::value )
    return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
  if ( is_hip<T>::value )
    return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
  if ( is_sycl<T>::value )
    return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
  return -1; // host backends do not use this hint
}
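
// Functor that computes the block residual y = b - R x, where R is the
// off-diagonal part of A described by AmD. The operator() overloads are
// selected by tag type: SeqTag for the simple owned-only case, AsyncTag for a
// single sweep over owned plus remote columns, and OverlapTag for the
// communication-overlapped owned/remote split.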
template <typename MatrixType>
struct ComputeResidualVector {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type;
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;
  static constexpr int vector_length = impl_type::vector_length;

  // team policy member type
  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };

  // multivector views: b (rhs), x / x_remote (owned / nonowned input), y outputs
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
  Unmanaged<vector_type_3d_view> y_packed;
  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;

  // AmD information
  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // block and point CRS graph information
  const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_block_rowptr;
  const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_point_rowptr;
  const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;

  // precomputed offsets (see precompute_A_x_offsets)
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  const bool is_dm2cm_active;
  const bool hasBlockCrsMatrix;
  template <typename LocalCrsGraphType>
  ComputeResidualVector(const AmD<MatrixType> &amd,
                        const LocalCrsGraphType &block_graph,
                        const LocalCrsGraphType &point_graph,
                        const local_ordinal_type &blocksize_requested_,
                        const PartInterface<MatrixType> &interf,
                        const local_ordinal_type_1d_view &dm2cm_,
                        bool hasBlockCrsMatrix_)
    : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
      colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
      tpetra_values(amd.tpetra_values),
      A_block_rowptr(block_graph.row_map),
      A_point_rowptr(point_graph.row_map),
      A_colind(block_graph.entries),
      blocksize_requested(blocksize_requested_),
      part2packrowidx0(interf.part2packrowidx0),
      part2rowidx0(interf.part2rowidx0),
      rowidx2part(interf.rowidx2part),
      partptr(interf.partptr),
      lclrow(interf.lclrow),
      dm2cm(dm2cm_),
      A_x_offsets(amd.A_x_offsets),
      A_x_offsets_remote(amd.A_x_offsets_remote),
      is_dm2cm_active(dm2cm_.span() > 0),
      hasBlockCrsMatrix(hasBlockCrsMatrix_)
  {}
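
  // Accumulates one block-row dot product into yy[ii] using the point CRS
  // layout (used when the matrix is not a BlockCrsMatrix); serial (range-policy)
  // path.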
  KOKKOS_INLINE_FUNCTION
  void
  SerialDot(const local_ordinal_type &blocksize,
            const local_ordinal_type &lclRowID,
            const local_ordinal_type &lclColID,
            const local_ordinal_type &ii,
            const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
            const impl_scalar_type * const KOKKOS_RESTRICT xx,
            impl_scalar_type * KOKKOS_RESTRICT yy) const {
    const size_type Aj_c = colindsub_(lclColID);
    auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
    impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#   pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#   pragma unroll
#endif
    for (local_ordinal_type k1=0;k1<blocksize;++k1)
      val += tpetra_values(point_row_offset + k1) * xx[k1];
    yy[ii] -= val;
  }
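
  // yy -= AA * xx for one little block stored in the BlockCrsMatrix value
  // layout; serial (range-policy) path.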
  KOKKOS_INLINE_FUNCTION
  void
  SerialGemv(const local_ordinal_type &blocksize,
             const impl_scalar_type * const KOKKOS_RESTRICT AA,
             const impl_scalar_type * const KOKKOS_RESTRICT xx,
             impl_scalar_type * KOKKOS_RESTRICT yy) const {
    using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
    for (local_ordinal_type k0=0;k0<blocksize;++k0) {
      impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#   pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#   pragma unroll
#endif
      for (local_ordinal_type k1=0;k1<blocksize;++k1)
        val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
      yy[k0] -= val;
    }
  }
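
  // Copies one block of bb into yy, vectorized over the block entries
  // (team-policy path).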
  template <typename bbViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  VectorCopy(const member_type &member,
             const local_ordinal_type &blocksize,
             const bbViewType &bb,
             const yyViewType &yy) const {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                         [&](const local_ordinal_type &k0) {
      yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
    });
  }
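
  // Team-policy analogue of SerialDot: the dot product over one block row is
  // reduced across the thread's vector lanes, then subtracted from yy(ii).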
  template <typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  VectorDot(const member_type &member,
            const local_ordinal_type &blocksize,
            const local_ordinal_type &lclRowID,
            const local_ordinal_type &lclColID,
            const local_ordinal_type &ii,
            const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
            const xxViewType &xx,
            const yyViewType &yy) const {
    const size_type Aj_c = colindsub_(lclColID);
    auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
    impl_scalar_type val = 0;
    Kokkos::parallel_reduce
      (Kokkos::ThreadVectorRange(member, blocksize),
       [&](const local_ordinal_type &k1, impl_scalar_type &update) {
         update += tpetra_values(point_row_offset + k1) * xx(k1);
       }, val);
    Kokkos::single(Kokkos::PerThread(member),
                   [&]() {
                     Kokkos::atomic_add(&yy(ii), typename yyViewType::const_value_type(-val));
                   });
  }
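
  // Team-policy block GEMV: for each block row k0, reduce the row-times-x
  // product across the vector lanes and subtract it from yy(k0).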
  template <typename AAViewType, typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  VectorGemv(const member_type &member,
             const local_ordinal_type &blocksize,
             const AAViewType &AA,
             const xxViewType &xx,
             const yyViewType &yy) const {
    for (local_ordinal_type k0=0;k0<blocksize;++k0) {
      impl_scalar_type val = 0;
      Kokkos::parallel_reduce
        (Kokkos::ThreadVectorRange(member, blocksize),
         [&](const local_ordinal_type &k1, impl_scalar_type &update) {
           update += AA(k0,k1)*xx(k1);
         }, val);
      Kokkos::single(Kokkos::PerThread(member),
                     [&]() {
                       Kokkos::atomic_add(&yy(k0), -val);
                     });
    }
  }
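
  // ===== SeqTag: residual y = b - R x over owned entries only =====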
  struct SeqTag {};

  KOKKOS_INLINE_FUNCTION
  void
  operator() (const SeqTag &, const local_ordinal_type &i) const {
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    // constants
    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);
    const local_ordinal_type row = i*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      // y := b
      impl_scalar_type *yy = &y(row, col);
      const impl_scalar_type * const bb = &b(row, col);
      memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);

      // y -= R x
      const size_type A_k0 = A_block_rowptr[i];
      for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
        const size_type j = A_k0 + colindsub[k];
        const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
        if (hasBlockCrsMatrix) {
          const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
          SerialGemv(blocksize,AA,xx,yy);
        } else {
          for (local_ordinal_type k0=0;k0<blocksize;++k0)
            SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
        }
      }
    }
  }
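
  // Team-policy variant of the SeqTag residual: one team per block row,
  // vectorized copy of b followed by a team-parallel loop over the row's blocks.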
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const SeqTag &, const member_type &member) const {
    // constants
    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize*blocksize;

    const local_ordinal_type lr = member.league_rank();
    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);

    // subview pattern
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto yy = Kokkos::subview(y, block_range, 0);
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col=0;col<num_vectors;++col) {
      // y := b
      yy.assign_data(&y(row, col));
      bb.assign_data(&b(row, col));
      if (member.team_rank() == 0)
        VectorCopy(member, blocksize, bb, yy);
      member.team_barrier();

      // y -= R x
      const size_type A_k0 = A_block_rowptr[lr];
      if (hasBlockCrsMatrix) {
        Kokkos::parallel_for
          (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
           [&](const local_ordinal_type &k) {
             const size_type j = A_k0 + colindsub[k];
             xx.assign_data( &x(A_colind[j]*blocksize, col) );
             A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
             VectorGemv(member, blocksize, A_block_cst, xx, yy);
           });
      } else {
        Kokkos::parallel_for
          (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
           [&](const local_ordinal_type &k) {
             const size_type j = A_k0 + colindsub[k];
             xx.assign_data( &x(A_colind[j]*blocksize, col) );
             for (local_ordinal_type k0=0;k0<blocksize;++k0)
               VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
           });
      }
    }
  }
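
  // ===== GeneralTag and its aliases =====
  // The remaining operator() overloads are parameterized by GeneralTag:
  //   B               - compile-time blocksize (0 means "use blocksize_requested")
  //   async           - owned and remote columns handled in a single sweep
  //   overlap         - remote-only sweep that accumulates into y (comm overlap)
  //   haveBlockMatrix - BlockCrsMatrix values vs. point CRS values
  // AsyncTag<B, haveBlockMatrix> is GeneralTag<B, true, false, ...>, and
  // OverlapTag<P, B, haveBlockMatrix> selects the owned (P == 0) or remote
  // (P != 0) phase of the overlapped scheme.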
  template <int B, bool async, bool overlap, bool haveBlockMatrix>
  struct GeneralTag {
    static_assert(!(async && overlap),
                  "ComputeResidualVector: async && overlap is not a valid configuration for GeneralTag");
  };

  template <int P, int B, bool haveBlockMatrix>
  using OverlapTag = GeneralTag<B, false, P != 0, haveBlockMatrix>;

  template <int B, bool haveBlockMatrix>
  using AsyncTag = GeneralTag<B, true, false, haveBlockMatrix>;
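
  // Range-policy residual for AsyncTag/OverlapTag: one thread per row. For an
  // overlap (remote) sweep the contribution is accumulated into y_packed,
  // otherwise y_packed is overwritten with b - R x.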
  template <int B, bool async, bool overlap, bool haveBlockMatrix>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const GeneralTag<B, async, overlap, haveBlockMatrix> &,
              const local_ordinal_type &rowidx) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const local_ordinal_type num_vectors = y_packed.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // temporary buffer for y flat view
    impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};

    const local_ordinal_type lr = lclrow(rowidx);

    auto colindsub_used = overlap ? colindsub_remote : colindsub;
    auto rowptr_used = overlap ? rowptr_remote : rowptr;

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if constexpr (overlap) {
        // y (temporary) := 0; the remote sweep only accumulates.
        memset((void*) yy, 0, sizeof(impl_scalar_type)*blocksize);
      } else {
        // y := b
        const local_ordinal_type row = lr*blocksize;
        memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
      }

      // y -= R x
      const size_type A_k0 = A_block_rowptr[lr];
      for (size_type k = rowptr_used[lr]; k < rowptr_used[lr+1]; ++k) {
        const size_type j = A_k0 + colindsub_used[k];
        const local_ordinal_type A_colind_at_j = A_colind[j];
        if constexpr (haveBlockMatrix) {
          const local_ordinal_type blocksize_square = blocksize*blocksize;
          const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
          if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type * const xx = &x(loc*blocksize, col);
            SerialGemv(blocksize, AA, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
            SerialGemv(blocksize, AA, xx_remote, yy);
          }
        } else {
          if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type * const xx = &x(loc*blocksize, col);
            for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
              SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
            for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
              SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
          }
        }
      }

      // move yy to y_packed
      if constexpr (overlap) {
        for (local_ordinal_type k=0;k<blocksize;++k)
          y_packed(pri, k, col)[v] += yy[k];
      } else {
        for (local_ordinal_type k=0;k<blocksize;++k)
          y_packed(pri, k, col)[v] = yy[k];
      }
    }
  }
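
  // Team-policy residual for BlockCrsMatrix values: uses the precomputed
  // A_x_offsets(_remote) views, accumulates the local A*x block in team
  // scratch, then writes y = b - Ax (or y -= Ax for the remote/overlap phase).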
  template <int B, bool async, bool overlap>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const GeneralTag<B, async, overlap, true> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
    using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
    subview_1D_right_t bb(nullptr, blocksize);
    subview_1D_right_t xx(nullptr, blocksize);
    subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

    // scratch: local_Ax accumulates A*x for the row (per team); local_x caches x (per thread)
    impl_scalar_type * local_Ax = reinterpret_cast<impl_scalar_type *>(member.team_scratch(0).get_shmem(blocksize*sizeof(impl_scalar_type)));
    impl_scalar_type * local_x = reinterpret_cast<impl_scalar_type *>(member.thread_scratch(0).get_shmem(blocksize*sizeof(impl_scalar_type)));

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // zero the accumulation buffer
      member.team_barrier();
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type &i) {
        local_Ax[i] = 0;
      });
      member.team_barrier();

      int numEntries;
      if constexpr (!overlap) {
        numEntries = A_x_offsets.extent(2);
      } else {
        numEntries = A_x_offsets_remote.extent(2);
      }

      Kokkos::parallel_for
        (Kokkos::TeamThreadRange(member, 0, numEntries),
         [&](const local_ordinal_type &k) {
           int64_t A_offset = overlap ? A_x_offsets_remote(rowidx, 0, k) : A_x_offsets(rowidx, 0, k);
           int64_t x_offset = overlap ? A_x_offsets_remote(rowidx, 1, k) : A_x_offsets(rowidx, 1, k);
           if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
             A_block_cst.assign_data(tpetra_values.data() + A_offset);
             // select x vs. x_remote from the precomputed offset
             if constexpr (async) {
               size_type remote_cutoff = blocksize * num_local_rows;
               if (x_offset >= remote_cutoff)
                 xx.assign_data(&x_remote(x_offset - remote_cutoff, col));
               else
                 xx.assign_data(&x(x_offset, col));
             } else {
               if constexpr (!overlap) {
                 xx.assign_data(&x(x_offset, col));
               } else {
                 xx.assign_data(&x_remote(x_offset, col));
               }
             }

             // cache x in thread-local scratch
             Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                  [&](const local_ordinal_type &i) {
               local_x[i] = xx(i);
             });

             // local_Ax += A_block * x
             Kokkos::parallel_for(
               Kokkos::ThreadVectorRange(member, blocksize),
               [&](const local_ordinal_type &k0) {
                 impl_scalar_type val = 0;
                 for (int k1=0; k1<blocksize; k1++)
                   val += A_block_cst(k0,k1)*local_x[k1];
                 Kokkos::atomic_add(local_Ax+k0, val);
               });
           }
         });
      member.team_barrier();
      // y = b - Ax (owned/async sweep), or y -= Ax (remote/overlap sweep)
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      bb.assign_data(&b(row, col));
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type &i) {
        if constexpr (!overlap)
          yy(i) = bb(i) - local_Ax[i];
        else
          yy(i) -= local_Ax[i];
      });
    }
  }
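
  // Team-policy residual for point CRS values (no BlockCrsMatrix): b is copied
  // into y_packed_scalar, then each off-diagonal block contributes through
  // VectorDot.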
  template <int B, bool async, bool overlap>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const GeneralTag<B, async, overlap, false> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
    using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
    subview_1D_right_t bb(nullptr, blocksize);
    subview_1D_right_t xx(nullptr, blocksize);
    subview_1D_right_t xx_remote(nullptr, blocksize);
    subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
    auto colindsub_used = overlap ? colindsub_remote : colindsub;
    auto rowptr_used = overlap ? rowptr_remote : rowptr;

    const local_ordinal_type lr = lclrow(rowidx);
    const local_ordinal_type row = lr*blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      if constexpr (!overlap) {
        // y := b (the remote/overlap sweep only subtracts)
        bb.assign_data(&b(row, col));
        if (member.team_rank() == 0)
          VectorCopy(member, blocksize, bb, yy);
        member.team_barrier();
      }

      // y -= R x
      const size_type A_k0 = A_block_rowptr[lr];
      Kokkos::parallel_for
        (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
         [&](const local_ordinal_type &k) {
           const size_type j = A_k0 + colindsub_used[k];
           const local_ordinal_type A_colind_at_j = A_colind[j];
           if ((async && A_colind_at_j < num_local_rows) || (!async && !overlap)) {
             const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
             xx.assign_data( &x(loc*blocksize, col) );
             for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
               VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
           } else {
             const auto loc = A_colind_at_j - num_local_rows;
             xx_remote.assign_data( &x_remote(loc*blocksize, col) );
             for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
               VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
           }
         });
    }
  }
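
  // Entry point for the SeqTag variant: y = b - R x with everything owned
  // locally (y, b, x are local multivector views).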
  template <typename MultiVectorLocalViewTypeY,
            typename MultiVectorLocalViewTypeB,
            typename MultiVectorLocalViewTypeX>
  void run(const MultiVectorLocalViewTypeY &y_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);

    y = y_; b = b_; x = x_;
    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize = blocksize_requested;
      const local_ordinal_type team_size = 8;
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
      const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
      Kokkos::parallel_for
        ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
    } else {
      const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
      Kokkos::parallel_for
        ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
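
  // Entry point for the AsyncTag variant: y_packed = b - R (x, x_remote),
  // with owned and remote columns handled in a single sweep.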
  template <typename MultiVectorLocalViewTypeB,
            typename MultiVectorLocalViewTypeX,
            typename MultiVectorLocalViewTypeX_Remote>
  void run(const vector_type_3d_view &y_packed_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_,
           const MultiVectorLocalViewTypeX_Remote &x_remote_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);

    b = b_; x = x_; x_remote = x_remote_;
    if constexpr (is_device<execution_space>::value) {
      y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
                                                 y_packed_.extent(0),
                                                 y_packed_.extent(1),
                                                 y_packed_.extent(2),
                                                 vector_length);
    } else {
      y_packed = y_packed_;
    }

    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize = blocksize_requested;
      // Dispatch the runtime blocksize to a compile-time blocksize B (0 is the
      // generic fallback) and to the BlockCrsMatrix vs. point CRS path.
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
      if (this->hasBlockCrsMatrix) {                                    \
        const local_ordinal_type team_size = 8;                         \
        const local_ordinal_type vector_size = 8;                       \
        const size_t shmem_team_size = blocksize*sizeof(btdm_scalar_type); \
        const size_t shmem_thread_size = blocksize*sizeof(btdm_scalar_type); \
        Kokkos::TeamPolicy<execution_space,AsyncTag<B, true> >          \
          policy(rowidx2part.extent(0), team_size, vector_size);        \
        policy.set_scratch_size(0,Kokkos::PerTeam(shmem_team_size),Kokkos::PerThread(shmem_thread_size)); \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::TeamPolicy::run<AsyncTag>",                \
           policy, *this);                                              \
      } else {                                                          \
        const local_ordinal_type team_size = 8;                         \
        const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
        const Kokkos::TeamPolicy<execution_space,AsyncTag<B, false> >   \
          policy(rowidx2part.extent(0), team_size, vector_size);        \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::TeamPolicy::run<AsyncTag>",                \
           policy, *this);                                              \
      } } break
      switch (blocksize_requested) {
      case  3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case  5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case  7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case  9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
      if (this->hasBlockCrsMatrix) {                                    \
        const Kokkos::RangePolicy<execution_space,AsyncTag<B, true> > policy(0, rowidx2part.extent(0)); \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::RangePolicy::run<AsyncTag>",               \
           policy, *this);                                              \
      } else {                                                          \
        const Kokkos::RangePolicy<execution_space,AsyncTag<B, false> > policy(0, rowidx2part.extent(0)); \
        Kokkos::parallel_for                                            \
          ("ComputeResidual::RangePolicy::run<AsyncTag>",               \
           policy, *this);                                              \
      } } break
      switch (blocksize_requested) {
      case  3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case  5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case  7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case  9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
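
  // Entry point for the OverlapTag variant: called twice per residual, once
  // with compute_owned = true (owned sweep, y_packed = b - R_owned x) and once
  // with compute_owned = false (remote sweep, y_packed -= R_remote x_remote),
  // so communication can overlap the owned sweep.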
  template <typename MultiVectorLocalViewTypeB,
            typename MultiVectorLocalViewTypeX,
            typename MultiVectorLocalViewTypeX_Remote>
  void run(const vector_type_3d_view &y_packed_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_,
           const MultiVectorLocalViewTypeX_Remote &x_remote_,
           const bool compute_owned) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);

    b = b_; x = x_; x_remote = x_remote_;
    if constexpr (is_device<execution_space>::value) {
      y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
                                                 y_packed_.extent(0),
                                                 y_packed_.extent(1),
                                                 y_packed_.extent(2),
                                                 vector_length);
    } else {
      y_packed = y_packed_;
    }

    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize = blocksize_requested;
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
      if (this->hasBlockCrsMatrix) {                                    \
        const local_ordinal_type team_size = 8;                         \
        const local_ordinal_type vector_size = 8;                       \
        const size_t shmem_team_size = blocksize*sizeof(btdm_scalar_type); \
        const size_t shmem_thread_size = blocksize*sizeof(btdm_scalar_type); \
        if (compute_owned) {                                            \
          Kokkos::TeamPolicy<execution_space,OverlapTag<0, B, true> >   \
            policy(rowidx2part.extent(0), team_size, vector_size);      \
          policy.set_scratch_size(0,Kokkos::PerTeam(shmem_team_size),Kokkos::PerThread(shmem_thread_size)); \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
        } else {                                                        \
          Kokkos::TeamPolicy<execution_space,OverlapTag<1, B, true> >   \
            policy(rowidx2part.extent(0), team_size, vector_size);      \
          policy.set_scratch_size(0,Kokkos::PerTeam(shmem_team_size),Kokkos::PerThread(shmem_thread_size)); \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
        }                                                               \
      } else {                                                          \
        const local_ordinal_type team_size = 8;                         \
        const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
        if (compute_owned) {                                            \
          const Kokkos::TeamPolicy<execution_space,OverlapTag<0, B, false> > \
            policy(rowidx2part.extent(0), team_size, vector_size);      \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
        } else {                                                        \
          const Kokkos::TeamPolicy<execution_space,OverlapTag<1, B, false> > \
            policy(rowidx2part.extent(0), team_size, vector_size);      \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
        }                                                               \
      } break
      switch (blocksize_requested) {
      case  3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case  5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case  7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case  9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
      if (this->hasBlockCrsMatrix) {                                    \
        if (compute_owned) {                                            \
          const Kokkos::RangePolicy<execution_space,OverlapTag<0, B, true>> \
            policy(0, rowidx2part.extent(0));                           \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
        } else {                                                        \
          const Kokkos::RangePolicy<execution_space,OverlapTag<1, B, true>> \
            policy(0, rowidx2part.extent(0));                           \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
        }                                                               \
      } else {                                                          \
        if (compute_owned) {                                            \
          const Kokkos::RangePolicy<execution_space,OverlapTag<0, B, false>> \
            policy(0, rowidx2part.extent(0));                           \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
        } else {                                                        \
          const Kokkos::RangePolicy<execution_space,OverlapTag<1, B, false>> \
            policy(0, rowidx2part.extent(0));                           \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
        }                                                               \
      } break
      switch (blocksize_requested) {
      case  3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
      case  5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
      case  7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
      case  9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
      case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
      case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
      case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
      case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
      case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
      default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};

} // namespace BlockHelperDetails

#endif