#ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"

namespace Ifpack2 {

namespace BlockHelperDetails {
///
/// A - Tridiags(A), i.e., R in the splitting A = D + R.
///
template <typename MatrixType>
struct AmD {
  using impl_type                       = BlockHelperDetails::ImplType<MatrixType>;
  using local_ordinal_type_1d_view      = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view               = typename impl_type::size_type_1d_view;
  using i64_3d_view                     = typename impl_type::i64_3d_view;
  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;

  // rowptr points to the start of each row of A_colindsub.
  size_type_1d_view rowptr, rowptr_remote;
  // Offsets within each row of A's graph selecting the entries that belong to
  // R. A_colindsub covers owned columns; A_colindsub_remote covers off-rank
  // (remote) columns when owned and remote entries are stored separately.
  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
  // Precomputed flat offsets into A's values (index 0) and x's values
  // (index 1) for each extracted entry; see precompute_A_x_offsets below.
  i64_3d_view A_x_offsets;
  // Same as A_x_offsets, but for remote entries. Empty unless owned and
  // remote entries are handled in separate passes.
  i64_3d_view A_x_offsets_remote;

  // Currently always true.
  bool is_tpetra_block_crs;

  // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
  impl_scalar_type_1d_view_tpetra tpetra_values;

  AmD()             = default;
  AmD(const AmD &b) = default;
};
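// Illustrative sketch (editor's note, not library code): given an AmD `amd`
// and the graph row map `A_block_rowptr`, the owned R-blocks of local block
// row `lr` can be enumerated as
//
//   for (size_type k = amd.rowptr(lr); k < amd.rowptr(lr + 1); ++k) {
//     const size_type j = A_block_rowptr(lr) + amd.A_colindsub(k);
//     // block j's values start at amd.tpetra_values.data() + j * blocksize * blocksize
//   }
//
// This is exactly the indexing the residual kernels below perform.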
///
/// Partition interface: how local rows are grouped into parts (tridiagonal
/// blocks) and how parts are packed for vectorization.
///
template <typename MatrixType>
struct PartInterface {
  using local_ordinal_type         = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
  using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
  using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;

  PartInterface()                       = default;
  PartInterface(const PartInterface &b) = default;

  // Local row IDs belonging to each part, in part order.
  local_ordinal_type_1d_view lclrow;
  // partptr(p) is the first flat row index of part p.
  local_ordinal_type_1d_view partptr;
  local_ordinal_type_2d_view partptr_sub;
  local_ordinal_type_1d_view partptr_schur;
  // packptr(i) indexes partptr; partptr(packptr(i)) is the start of pack i.
  local_ordinal_type_1d_view packptr;
  local_ordinal_type_1d_view packptr_sub;
  local_ordinal_type_1d_view packindices_sub;
  local_ordinal_type_2d_view packindices_schur;
  // part2rowidx0(p) is the flat row index at which part p starts.
  local_ordinal_type_1d_view part2rowidx0;
  local_ordinal_type_1d_view part2rowidx0_sub;
  // part2packrowidx0(p) is the packed row index at which part p starts.
  local_ordinal_type_1d_view part2packrowidx0;
  local_ordinal_type_2d_view part2packrowidx0_sub;
  local_ordinal_type part2packrowidx0_back;
  // rowidx2part(i) is the part owning flat row i.
  local_ordinal_type_1d_view rowidx2part;
  local_ordinal_type_1d_view rowidx2part_sub;

  local_ordinal_type max_partsz;
  local_ordinal_type max_subpartsz;
  local_ordinal_type n_subparts_per_part;
  local_ordinal_type nparts;
};
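// Illustrative sketch (editor's note, not library code): the kernels below
// locate a row's slot in the packed storage via
//
//   partidx = rowidx2part(rowidx);
//   pri     = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
//   v       = partidx % vector_length;
//
// i.e., pri selects the packed row and v selects the SIMD lane within the pack.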
// Precompute direct offsets to the A and x entries touched by the residual so
// the kernels can read both with a single level of indirection. Allocates and
// fills amd.A_x_offsets (and amd.A_x_offsets_remote when owned and remote
// entries are handled in separate passes).
template <typename MatrixType>
void precompute_A_x_offsets(
    AmD<MatrixType> &amd,
    const PartInterface<MatrixType> &interf,
    const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_crs_graph_type> &g,
    const typename ImplType<MatrixType>::local_ordinal_type_1d_view &dm2cm,
    int blocksize,
    bool ownedRemoteSeparate) {
  using impl_type          = ImplType<MatrixType>;
  using i64_3d_view        = typename impl_type::i64_3d_view;
  using size_type          = typename impl_type::size_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using execution_space    = typename impl_type::execution_space;

  auto local_graph                = g->getLocalGraphDevice();
  const auto A_block_rowptr       = local_graph.row_map;
  const auto A_colind             = local_graph.entries;
  local_ordinal_type numLocalRows = interf.rowidx2part.extent(0);
  int blocksize_square            = blocksize * blocksize;
  // Shallow-copy the views accessed inside the device lambdas below.
  auto lclrow             = interf.lclrow;
  auto A_colindsub        = amd.A_colindsub;
  auto A_colindsub_remote = amd.A_colindsub_remote;
  auto rowptr             = amd.rowptr;
  auto rowptr_remote      = amd.rowptr_remote;
  bool is_dm2cm_active    = dm2cm.extent(0);
  if (ownedRemoteSeparate) {
    // Owned and remote entries live in separate arrays; compute the widest
    // row of each so the offset views can be allocated rectangularly.
    local_ordinal_type maxOwnedEntriesPerRow    = 0;
    local_ordinal_type maxNonownedEntriesPerRow = 0;
    Kokkos::parallel_reduce(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmaxOwned, local_ordinal_type & lmaxNonowned) {
          const local_ordinal_type lr    = lclrow(i);
          local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowptr(lr);
          if (rowNumOwned > lmaxOwned)
            lmaxOwned = rowNumOwned;
          if (rowptr_remote.extent(0)) {
            local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowptr_remote(lr);
            if (rowNumNonowned > lmaxNonowned)
              lmaxNonowned = rowNumNonowned;
          }
        },
        Kokkos::Max<local_ordinal_type>(maxOwnedEntriesPerRow), Kokkos::Max<local_ordinal_type>(maxNonownedEntriesPerRow));

    amd.A_x_offsets        = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxOwnedEntriesPerRow);
    amd.A_x_offsets_remote = i64_3d_view("amd.A_x_offsets_remote", numLocalRows, 2, maxNonownedEntriesPerRow);
    auto A_x_offsets        = amd.A_x_offsets;
    auto A_x_offsets_remote = amd.A_x_offsets_remote;
    // Fill in the offsets; rows shorter than the maximum are padded with
    // int64_t's min as a sentinel.
    Kokkos::parallel_for(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i) {
          const local_ordinal_type lr = lclrow(i);
          const size_type A_k0        = A_block_rowptr(lr);
          // Owned entries.
          size_type rowBegin             = rowptr(lr);
          local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowBegin;
          for (local_ordinal_type entry = 0; entry < maxOwnedEntriesPerRow; entry++) {
            if (entry < rowNumOwned) {
              const size_type j                      = A_k0 + A_colindsub(rowBegin + entry);
              const local_ordinal_type A_colind_at_j = A_colind(j);
              const local_ordinal_type loc           = is_dm2cm_active ? dm2cm(A_colind_at_j) : A_colind_at_j;
              A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
              A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
            } else {
#if KOKKOS_VERSION >= 40799
              A_x_offsets(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
#else
              A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
#endif
#if KOKKOS_VERSION >= 40799
              A_x_offsets(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
#else
              A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
#endif
            }
          }
          // Remote entries; their x offsets are relative to x_remote.
          if (rowptr_remote.extent(0)) {
            rowBegin                          = rowptr_remote(lr);
            local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowBegin;
            for (local_ordinal_type entry = 0; entry < maxNonownedEntriesPerRow; entry++) {
              if (entry < rowNumNonowned) {
                const size_type j                      = A_k0 + A_colindsub_remote(rowBegin + entry);
                const local_ordinal_type A_colind_at_j = A_colind(j);
                const local_ordinal_type loc           = A_colind_at_j - numLocalRows;
                A_x_offsets_remote(i, 0, entry) = int64_t(j) * blocksize_square;
                A_x_offsets_remote(i, 1, entry) = int64_t(loc) * blocksize;
              } else {
#if KOKKOS_VERSION >= 40799
                A_x_offsets_remote(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
#else
                A_x_offsets_remote(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
#endif
#if KOKKOS_VERSION >= 40799
                A_x_offsets_remote(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
#else
                A_x_offsets_remote(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
#endif
              }
            }
          }
        });
  } else {
    // Owned and remote entries are interleaved in amd.rowptr/A_colindsub.
    local_ordinal_type maxEntriesPerRow = 0;
    Kokkos::parallel_reduce(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmax) {
          const local_ordinal_type lr = lclrow(i);
          local_ordinal_type rowNum   = rowptr(lr + 1) - rowptr(lr);
          if (rowNum > lmax)
            lmax = rowNum;
        },
        Kokkos::Max<local_ordinal_type>(maxEntriesPerRow));
    amd.A_x_offsets  = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxEntriesPerRow);
    auto A_x_offsets = amd.A_x_offsets;

    // Fill in the offsets. Remote columns (A_colind(j) >= numLocalRows) keep
    // their raw offset; the kernel recognizes them by comparing against
    // blocksize * numLocalRows.
    Kokkos::parallel_for(
        Kokkos::RangePolicy<execution_space>(0, numLocalRows),
        KOKKOS_LAMBDA(local_ordinal_type i) {
          const local_ordinal_type lr = lclrow(i);
          const size_type A_k0        = A_block_rowptr(lr);
          size_type rowBegin          = rowptr(lr);
          local_ordinal_type rowOwned = rowptr(lr + 1) - rowBegin;
          for (local_ordinal_type entry = 0; entry < maxEntriesPerRow; entry++) {
            if (entry < rowOwned) {
              const size_type j        = A_k0 + A_colindsub(rowBegin + entry);
              A_x_offsets(i, 0, entry) = j * blocksize_square;
              const local_ordinal_type A_colind_at_j = A_colind(j);
              if (A_colind_at_j < numLocalRows) {
                const local_ordinal_type loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
              } else {
                A_x_offsets(i, 1, entry) = int64_t(A_colind_at_j) * blocksize;
              }
            } else {
#if KOKKOS_VERSION >= 40799
              A_x_offsets(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
#else
              A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
#endif
#if KOKKOS_VERSION >= 40799
              A_x_offsets(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
#else
              A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
#endif
            }
          }
        });
  }
}
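// Illustrative sketch (editor's note, not library code): a kernel consumes the
// precomputed offsets with two indirections instead of walking the graph:
//
//   int64_t A_off = A_x_offsets(rowidx, 0, k);  // offset into tpetra_values
//   int64_t x_off = A_x_offsets(rowidx, 1, k);  // offset into x (or x_remote)
//   if (A_off != Kokkos::ArithTraits<int64_t>::min()) {  // skip padded slots
//     const auto *A_block = tpetra_values.data() + A_off;
//     const auto *x_block = &x(x_off, col);
//     // ... accumulate A_block * x_block into the local residual
//   }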
static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if (blksize <= 5)
    total_team_size = 32;
  else if (blksize <= 9)
    total_team_size = 32;
  else if (blksize <= 12)
    total_team_size = 96;
  else if (blksize <= 16)
    total_team_size = 128;
  else if (blksize <= 20)
    total_team_size = 160;
  else
    total_team_size = 160;
  return total_team_size / team_size;
}

static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
                                                                const int team_size) {
  int total_team_size(0);
  if (blksize <= 5)
    total_team_size = 32;
  else if (blksize <= 9)
    total_team_size = 32;
  else if (blksize <= 12)
    total_team_size = 96;
  else if (blksize <= 16)
    total_team_size = 128;
  else if (blksize <= 20)
    total_team_size = 160;
  else
    total_team_size = 160;
  return total_team_size / team_size;
}

static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
                                                                 const int team_size) {
  int total_team_size(0);
  if (blksize <= 5)
    total_team_size = 32;
  else if (blksize <= 9)
    total_team_size = 32;
  else if (blksize <= 12)
    total_team_size = 96;
  else if (blksize <= 16)
    total_team_size = 128;
  else if (blksize <= 20)
    total_team_size = 160;
  else
    total_team_size = 160;
  return total_team_size / team_size;
}
template <typename T>
static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
                                                             const int team_size) {
  if (is_cuda<T>::value)
    return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
  if (is_hip<T>::value)
    return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
  if (is_sycl<T>::value)
    return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
  return -1;
}
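// Worked example (editor's note): on CUDA with blksize = 16 and team_size = 8,
// the table above picks total_team_size = 128, so the recommended vector
// length is 128 / 8 = 16 lanes per thread.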
///
/// compute residual vector
///
template <typename MatrixType>
struct ComputeResidualVector {
 public:
  using impl_type        = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space  = typename impl_type::execution_space;
  using memory_space     = typename impl_type::memory_space;

  using local_ordinal_type  = typename impl_type::local_ordinal_type;
  using size_type           = typename impl_type::size_type;
  using impl_scalar_type    = typename impl_type::impl_scalar_type;
  using magnitude_type      = typename impl_type::magnitude_type;
  using btdm_scalar_type    = typename impl_type::btdm_scalar_type;
  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
  /// views
  using local_ordinal_type_1d_view      = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view               = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type   = typename impl_type::tpetra_block_access_view_type;  // block crs (layout right)
  using impl_scalar_type_1d_view        = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector (layout left)
  using vector_type_3d_view             = typename impl_type::vector_type_3d_view;
  using btdm_scalar_type_4d_view        = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view                     = typename impl_type::i64_3d_view;
  static constexpr int vector_length    = impl_type::vector_length;

  /// team policy member type (used in device kernels)
  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize
  enum : int { max_blocksize = 32 };
 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
  Unmanaged<vector_type_3d_view> y_packed;
  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;

  // AmD information
  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // block crs graph information
  // (the kokkos crs graph uses a different size_type from size_t)
  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_block_rowptr;
  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_point_rowptr;
  const ConstUnmanaged<Kokkos::View<local_ordinal_type *, node_device_type>> A_colind;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;

  // precomputed offsets for fast residual evaluation
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  const bool is_dm2cm_active;
  const bool hasBlockCrsMatrix;
 public:
  template <typename LocalCrsGraphType>
  ComputeResidualVector(const AmD<MatrixType> &amd,
                        const LocalCrsGraphType &block_graph,
                        const LocalCrsGraphType &point_graph,
                        const local_ordinal_type &blocksize_requested_,
                        const PartInterface<MatrixType> &interf,
                        const local_ordinal_type_1d_view &dm2cm_,
                        bool hasBlockCrsMatrix_)
    : rowptr(amd.rowptr)
    , rowptr_remote(amd.rowptr_remote)
    , colindsub(amd.A_colindsub)
    , colindsub_remote(amd.A_colindsub_remote)
    , tpetra_values(amd.tpetra_values)
    , A_block_rowptr(block_graph.row_map)
    , A_point_rowptr(point_graph.row_map)
    , A_colind(block_graph.entries)
    , blocksize_requested(blocksize_requested_)
    , part2packrowidx0(interf.part2packrowidx0)
    , part2rowidx0(interf.part2rowidx0)
    , rowidx2part(interf.rowidx2part)
    , partptr(interf.partptr)
    , lclrow(interf.lclrow)
    , dm2cm(dm2cm_)
    , A_x_offsets(amd.A_x_offsets)
    , A_x_offsets_remote(amd.A_x_offsets_remote)
    , is_dm2cm_active(dm2cm_.span() > 0)
    , hasBlockCrsMatrix(hasBlockCrsMatrix_) {}
  // yy[ii] -= (point row ii of block column lclColID) dot xx
  KOKKOS_INLINE_FUNCTION
  void
  SerialDot(const local_ordinal_type &blocksize,
            const local_ordinal_type &lclRowID,
            const local_ordinal_type &lclColID,
            const local_ordinal_type &ii,
            const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
            const impl_scalar_type *const KOKKOS_RESTRICT xx,
            impl_scalar_type *KOKKOS_RESTRICT yy) const {
    const size_type Aj_c  = colindsub_(lclColID);
    auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
    impl_scalar_type val  = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
    for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
      val += tpetra_values(point_row_offset + k1) * xx[k1];
    yy[ii] -= val;
  }
  // yy -= AA * xx, where AA is a blocksize x blocksize little block stored in
  // BlockCrsMatrix's flat value layout
  KOKKOS_INLINE_FUNCTION
  void
  SerialGemv(const local_ordinal_type &blocksize,
             const impl_scalar_type *const KOKKOS_RESTRICT AA,
             const impl_scalar_type *const KOKKOS_RESTRICT xx,
             impl_scalar_type *KOKKOS_RESTRICT yy) const {
    using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
    for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
      impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
      for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
        val += AA[tlb::getFlatIndex(k0, k1, blocksize)] * xx[k1];
      yy[k0] -= val;
    }
  }
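  // Illustrative sketch (editor's note, not library code): SerialGemv performs
  // the update y -= A*x on one little block. E.g. for blocksize = 2 with a
  // row-major block A = [[1, 2], [3, 4]], x = [1, 1], y = [10, 10]:
  //   y[0] -= 1*1 + 2*1  ->  7
  //   y[1] -= 3*1 + 4*1  ->  3
  // getFlatIndex abstracts the actual layout chosen by Tpetra::BlockCrsMatrix.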
  // team-parallel copy yy := bb over one block
  template <typename bbViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  VectorCopy(const member_type &member,
             const local_ordinal_type &blocksize,
             const bbViewType &bb,
             const yyViewType &yy) const {
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
      yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
    });
  }
  // yy(ii) -= (point row ii of block column lclColID) dot xx, with the dot
  // product spread across the thread's vector lanes
  template <typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  VectorDot(const member_type &member,
            const local_ordinal_type &blocksize,
            const local_ordinal_type &lclRowID,
            const local_ordinal_type &lclColID,
            const local_ordinal_type &ii,
            const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
            const xxViewType &xx,
            const yyViewType &yy) const {
    const size_type Aj_c  = colindsub_(lclColID);
    auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
    impl_scalar_type val  = 0;
    Kokkos::parallel_reduce(
        Kokkos::ThreadVectorRange(member, blocksize),
        [&](const local_ordinal_type &k1, impl_scalar_type &update) {
          update += tpetra_values(point_row_offset + k1) * xx(k1);
        },
        val);
    Kokkos::single(Kokkos::PerThread(member),
                   [&]() {
                     Kokkos::atomic_add(&yy(ii), typename yyViewType::const_value_type(-val));
                   });
  }
  // yy -= AA * xx, one block row per iteration, dot products across vector lanes
  template <typename AAViewType, typename xxViewType, typename yyViewType>
  KOKKOS_INLINE_FUNCTION
  void
  VectorGemv(const member_type &member,
             const local_ordinal_type &blocksize,
             const AAViewType &AA,
             const xxViewType &xx,
             const yyViewType &yy) const {
    for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
      impl_scalar_type val = 0;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type &k1, impl_scalar_type &update) {
            update += AA(k0, k1) * xx(k1);
          },
          val);
      Kokkos::single(Kokkos::PerThread(member),
                     [&]() {
                       Kokkos::atomic_add(&yy(k0), -val);
                     });
    }
  }
  // * * * sequential mode: owned and remote entries in one array * * *
  struct SeqTag {};

  KOKKOS_INLINE_FUNCTION
  void
  operator()(const SeqTag &, const local_ordinal_type &i) const {
    const local_ordinal_type blocksize        = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize * blocksize;

    // constants
    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);
    const local_ordinal_type row         = i * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // y := b
      impl_scalar_type *yy             = &y(row, col);
      const impl_scalar_type *const bb = &b(row, col);
      memcpy(yy, bb, sizeof(impl_scalar_type) * blocksize);

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[i];
      for (size_type k = rowptr[i]; k < rowptr[i + 1]; ++k) {
        const size_type j                = A_k0 + colindsub[k];
        const impl_scalar_type *const xx = &x(A_colind[j] * blocksize, col);
        if (hasBlockCrsMatrix) {
          const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
          SerialGemv(blocksize, AA, xx, yy);
        } else {
          for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
            SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
        }
      }
    }
  }
  KOKKOS_INLINE_FUNCTION
  void
  operator()(const SeqTag &, const member_type &member) const {
    // constants
    const local_ordinal_type blocksize        = blocksize_requested;
    const local_ordinal_type blocksize_square = blocksize * blocksize;

    const local_ordinal_type lr = member.league_rank();
    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors = y.extent(1);

    // subview pattern
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto yy = Kokkos::subview(y, block_range, 0);
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);

    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // y := b
      yy.assign_data(&y(row, col));
      bb.assign_data(&b(row, col));
      if (member.team_rank() == 0)
        VectorCopy(member, blocksize, bb, yy);
      member.team_barrier();

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];

      if (hasBlockCrsMatrix) {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
                             [&](const local_ordinal_type &k) {
                               const size_type j = A_k0 + colindsub[k];
                               xx.assign_data(&x(A_colind[j] * blocksize, col));
                               A_block_cst.assign_data(&tpetra_values(j * blocksize_square));
                               VectorGemv(member, blocksize, A_block_cst, xx, yy);
                             });
      } else {
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
                             [&](const local_ordinal_type &k) {
                               const size_type j = A_k0 + colindsub[k];
                               xx.assign_data(&x(A_colind[j] * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
                             });
      }
    }
  }
  // * * * general mode * * *
  // async:   one pass over owned plus received remote entries.
  // overlap: separate passes over owned (P == 0) and remote (P != 0) entries,
  //          so communication can overlap the owned computation.
  template <int B, bool async, bool overlap, bool haveBlockMatrix>
  struct GeneralTag {
    static_assert(!(async && overlap),
                  "ComputeResidualVector: async && overlap is not a valid configuration for GeneralTag");
  };

  template <int P, int B, bool haveBlockMatrix>
  using OverlapTag = GeneralTag<B, false, P != 0, haveBlockMatrix>;

  template <int B, bool haveBlockMatrix>
  using AsyncTag = GeneralTag<B, true, false, haveBlockMatrix>;
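  // Editor's note, illustrative only: the aliases simply fix the GeneralTag
  // flags. E.g. AsyncTag<3, true> is GeneralTag<3, true, false, true>
  // (blocksize 3, single async pass, block matrix), while OverlapTag<1, 3, true>
  // is GeneralTag<3, false, true, true> (the remote pass of the overlapped
  // scheme). B == 0 keeps the blocksize a runtime value (blocksize_requested).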
  template <int B, bool async, bool overlap, bool haveBlockMatrix>
  KOKKOS_INLINE_FUNCTION
  void
  operator()(const GeneralTag<B, async, overlap, haveBlockMatrix> &, const local_ordinal_type &rowidx) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri     = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v       = partidx % vector_length;

    const local_ordinal_type num_vectors    = y_packed.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // temporary buffer for y flat
    impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};

    const local_ordinal_type lr = lclrow(rowidx);

    auto colindsub_used = overlap ? colindsub_remote : colindsub;
    auto rowptr_used    = overlap ? rowptr_remote : rowptr;

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if constexpr (overlap) {
        // y (temporary) := 0
        memset((void *)yy, 0, sizeof(impl_scalar_type) * blocksize);
      } else {
        // y := b
        const local_ordinal_type row = lr * blocksize;
        memcpy(yy, &b(row, col), sizeof(impl_scalar_type) * blocksize);
      }

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];
      for (size_type k = rowptr_used[lr]; k < rowptr_used[lr + 1]; ++k) {
        const size_type j                      = A_k0 + colindsub_used[k];
        const local_ordinal_type A_colind_at_j = A_colind[j];
        if constexpr (haveBlockMatrix) {
          const local_ordinal_type blocksize_square = blocksize * blocksize;
          const impl_scalar_type *const AA          = &tpetra_values(j * blocksize_square);
          if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
            const auto loc                   = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type *const xx = &x(loc * blocksize, col);
            SerialGemv(blocksize, AA, xx, yy);
          } else {
            const auto loc                          = A_colind_at_j - num_local_rows;
            const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
            SerialGemv(blocksize, AA, xx_remote, yy);
          }
        } else {
          if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
            const auto loc                   = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type *const xx = &x(loc * blocksize, col);
            for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
              SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
          } else {
            const auto loc                          = A_colind_at_j - num_local_rows;
            const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
            for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
              SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
          }
        }
      }

      // move yy to y_packed
      if constexpr (overlap) {
        // remote pass: accumulate into the owned-pass result
        for (local_ordinal_type k = 0; k < blocksize; ++k)
          y_packed(pri, k, col)[v] += yy[k];
      } else {
        for (local_ordinal_type k = 0; k < blocksize; ++k)
          y_packed(pri, k, col)[v] = yy[k];
      }
    }
  }
  // team-parallel version of the general residual (block CRS path, using the
  // precomputed A_x_offsets)
  template <int B, bool async, bool overlap>
  KOKKOS_INLINE_FUNCTION
  void
  operator()(const GeneralTag<B, async, overlap, true> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants
    const local_ordinal_type rowidx  = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri     = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v       = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors    = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    auto bb = Kokkos::subview(b, block_range, 0);
    auto xx = bb;
    auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);

    // team scratch holds the accumulator A*x; thread scratch holds the gathered x block
    impl_scalar_type *local_Ax = reinterpret_cast<impl_scalar_type *>(member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type *local_x  = reinterpret_cast<impl_scalar_type *>(member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));

    const local_ordinal_type lr  = lclrow(rowidx);
    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // ensure the team is done with local_Ax from the previous iteration
      member.team_barrier();

      // local_Ax := 0
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
        local_Ax[i] = 0;
      });
      member.team_barrier();

      local_ordinal_type numEntries;
      if constexpr (!overlap) {
        numEntries = A_x_offsets.extent(2);
      } else {
        numEntries = A_x_offsets_remote.extent(2);
      }

      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numEntries),
                           [&](const local_ordinal_type &k) {
                             int64_t A_offset = overlap ? A_x_offsets_remote(rowidx, 0, k) : A_x_offsets(rowidx, 0, k);
                             int64_t x_offset = overlap ? A_x_offsets_remote(rowidx, 1, k) : A_x_offsets(rowidx, 1, k);
#if KOKKOS_VERSION >= 40799
                             if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
#else
                             if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
#endif
                               A_block_cst.assign_data(tpetra_values.data() + A_offset);

                               // locate x; in async mode remote offsets lie past the owned range
                               if constexpr (async) {
                                 size_type remote_cutoff = blocksize * num_local_rows;
                                 if (x_offset >= remote_cutoff)
                                   xx.assign_data(&x_remote(x_offset - remote_cutoff, col));
                                 else
                                   xx.assign_data(&x(x_offset, col));
                               } else {
                                 if constexpr (!overlap) {
                                   xx.assign_data(&x(x_offset, col));
                                 } else {
                                   xx.assign_data(&x_remote(x_offset, col));
                                 }
                               }

                               // pull the x block into thread scratch
                               Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
                                 local_x[i] = xx(i);
                               });

                               // MatVec op: local_Ax += A * x
                               Kokkos::parallel_for(
                                   Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type &k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_Ax + k0, val);
                                   });
                             }
                           });
      member.team_barrier();

      // y := b - local_Ax (owned pass), or y -= local_Ax (remote pass)
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      bb.assign_data(&b(row, col));
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
        if constexpr (!overlap) {
          yy(i) = bb(i) - local_Ax[i];
        } else {
          yy(i) -= local_Ax[i];
        }
      });
    }
  }
  // team-parallel version of the general residual (point CRS path)
  template <int B, bool async, bool overlap>
  KOKKOS_INLINE_FUNCTION
  void
  operator()(const GeneralTag<B, async, overlap, false> &, const member_type &member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

    // constants
    const local_ordinal_type rowidx  = member.league_rank();
    const local_ordinal_type partidx = rowidx2part(rowidx);
    const local_ordinal_type pri     = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
    const local_ordinal_type v       = partidx % vector_length;

    const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
    const local_ordinal_type num_vectors    = y_packed_scalar.extent(2);
    const local_ordinal_type num_local_rows = lclrow.extent(0);

    // subview pattern
    auto bb        = Kokkos::subview(b, block_range, 0);
    auto xx        = bb;
    auto xx_remote = bb;
    auto yy        = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);
    auto colindsub_used = overlap ? colindsub_remote : colindsub;
    auto rowptr_used    = overlap ? rowptr_remote : rowptr;

    const local_ordinal_type lr  = lclrow(rowidx);
    const local_ordinal_type row = lr * blocksize;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      yy.assign_data(&y_packed_scalar(pri, 0, col, v));
      if constexpr (!overlap) {
        // y := b
        bb.assign_data(&b(row, col));
        if (member.team_rank() == 0)
          VectorCopy(member, blocksize, bb, yy);
        member.team_barrier();
      }

      // y -= Rx
      const size_type A_k0 = A_block_rowptr[lr];
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr + 1]),
                           [&](const local_ordinal_type &k) {
                             const size_type j                      = A_k0 + colindsub_used[k];
                             const local_ordinal_type A_colind_at_j = A_colind[j];
                             if ((async && A_colind_at_j < num_local_rows) || (!async && !overlap)) {
                               const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                               xx.assign_data(&x(loc * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
                             } else {
                               const auto loc = A_colind_at_j - num_local_rows;
                               xx_remote.assign_data(&x_remote(loc * blocksize, col));
                               for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                                 VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
                             }
                           });
    }
  }
  // y = b - Rx; seq method
  template <typename MultiVectorLocalViewTypeY,
            typename MultiVectorLocalViewTypeB,
            typename MultiVectorLocalViewTypeX>
  void run(const MultiVectorLocalViewTypeY &y_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);

    y = y_;
    b = b_;
    x = x_;
    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize   = blocksize_requested;
      const local_ordinal_type team_size   = 8;
      const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
      const Kokkos::TeamPolicy<execution_space, SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
    } else {
      const Kokkos::RangePolicy<execution_space, SeqTag> policy(0, rowptr.extent(0) - 1);
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
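  // Illustrative usage sketch (editor's note, not library code): a driver such
  // as BlockTriDiContainer constructs this functor once and calls one of the
  // run() overloads per application, e.g.
  //
  //   ComputeResidualVector<MatrixType> cr(amd, block_graph, point_graph,
  //                                        blocksize, interf, dm2cm, true);
  //   cr.run(y, b, x);                         // sequential: y = b - R x
  //   cr.run(y_packed, b, x, x_remote);        // async: owned + remote, one pass
  //   cr.run(y_packed, b, x, x_remote, true);  // overlap: owned pass (then call
  //                                            // again with false for remote)
  //
  // The view arguments are local multivector views; the names here are
  // placeholders.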
  // y = b - R (x, x_remote); async version
  template <typename MultiVectorLocalViewTypeB,
            typename MultiVectorLocalViewTypeX,
            typename MultiVectorLocalViewTypeX_Remote>
  void run(const vector_type_3d_view &y_packed_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_,
           const MultiVectorLocalViewTypeX_Remote &x_remote_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);

    b        = b_;
    x        = x_;
    x_remote = x_remote_;
    if constexpr (is_device<execution_space>::value) {
      y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
                                                 y_packed_.extent(0),
                                                 y_packed_.extent(1),
                                                 y_packed_.extent(2),
                                                 vector_length);
    } else {
      y_packed = y_packed_;
    }
    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize = blocksize_requested;
      // Instantiate the kernel for common blocksizes; B = 0 falls back to the
      // runtime blocksize.
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                  \
  if (this->hasBlockCrsMatrix) {                                                                        \
    const local_ordinal_type team_size   = 8;                                                           \
    const local_ordinal_type vector_size = 8;                                                           \
    const size_t shmem_team_size         = blocksize * sizeof(btdm_scalar_type);                        \
    const size_t shmem_thread_size       = blocksize * sizeof(btdm_scalar_type);                        \
    Kokkos::TeamPolicy<execution_space, AsyncTag<B, true>>                                              \
        policy(rowidx2part.extent(0), team_size, vector_size);                                          \
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
    Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>",                                  \
                         policy, *this);                                                                \
  } else {                                                                                              \
    const local_ordinal_type team_size   = 8;                                                           \
    const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
    const Kokkos::TeamPolicy<execution_space, AsyncTag<B, false>>                                       \
        policy(rowidx2part.extent(0), team_size, vector_size);                                          \
    Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>",                                  \
                         policy, *this);                                                                \
  }                                                                                                     \
  break
      switch (blocksize_requested) {
        case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
        case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
        case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
        case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                               \
  if (this->hasBlockCrsMatrix) {                                                                     \
    const Kokkos::RangePolicy<execution_space, AsyncTag<B, true>> policy(0, rowidx2part.extent(0));  \
    Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>",                              \
                         policy, *this);                                                             \
  } else {                                                                                           \
    const Kokkos::RangePolicy<execution_space, AsyncTag<B, false>> policy(0, rowidx2part.extent(0)); \
    Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>",                              \
                         policy, *this);                                                             \
  }                                                                                                  \
  break
      switch (blocksize_requested) {
        case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
        case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
        case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
        case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
  // y = b - R (x, x_remote); overlapped (two-pass) version
  template <typename MultiVectorLocalViewTypeB,
            typename MultiVectorLocalViewTypeX,
            typename MultiVectorLocalViewTypeX_Remote>
  void run(const vector_type_3d_view &y_packed_,
           const MultiVectorLocalViewTypeB &b_,
           const MultiVectorLocalViewTypeX &x_,
           const MultiVectorLocalViewTypeX_Remote &x_remote_,
           const bool compute_owned) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);

    b        = b_;
    x        = x_;
    x_remote = x_remote_;
    if constexpr (is_device<execution_space>::value) {
      y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
                                                 y_packed_.extent(0),
                                                 y_packed_.extent(1),
                                                 y_packed_.extent(2),
                                                 vector_length);
    } else {
      y_packed = y_packed_;
    }
    if constexpr (is_device<execution_space>::value) {
      const local_ordinal_type blocksize = blocksize_requested;
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                                    \
  if (this->hasBlockCrsMatrix) {                                                                          \
    const local_ordinal_type team_size   = 8;                                                             \
    const local_ordinal_type vector_size = 8;                                                             \
    const size_t shmem_team_size         = blocksize * sizeof(btdm_scalar_type);                          \
    const size_t shmem_thread_size       = blocksize * sizeof(btdm_scalar_type);                          \
    if (compute_owned) {                                                                                  \
      Kokkos::TeamPolicy<execution_space, OverlapTag<0, B, true>>                                         \
          policy(rowidx2part.extent(0), team_size, vector_size);                                          \
      policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this);            \
    } else {                                                                                              \
      Kokkos::TeamPolicy<execution_space, OverlapTag<1, B, true>>                                         \
          policy(rowidx2part.extent(0), team_size, vector_size);                                          \
      policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this);            \
    }                                                                                                     \
  } else {                                                                                                \
    const local_ordinal_type team_size   = 8;                                                             \
    const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
    if (compute_owned) {                                                                                  \
      const Kokkos::TeamPolicy<execution_space, OverlapTag<0, B, false>>                                  \
          policy(rowidx2part.extent(0), team_size, vector_size);                                          \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this);            \
    } else {                                                                                              \
      const Kokkos::TeamPolicy<execution_space, OverlapTag<1, B, false>>                                  \
          policy(rowidx2part.extent(0), team_size, vector_size);                                          \
      Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this);            \
    }                                                                                                     \
  }                                                                                                       \
  break
      switch (blocksize_requested) {
        case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
        case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
        case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
        case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                                          \
  if (this->hasBlockCrsMatrix) {                                                                \
    if (compute_owned) {                                                                        \
      const Kokkos::RangePolicy<execution_space, OverlapTag<0, B, true>>                        \
          policy(0, rowidx2part.extent(0));                                                     \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
    } else {                                                                                    \
      const Kokkos::RangePolicy<execution_space, OverlapTag<1, B, true>>                        \
          policy(0, rowidx2part.extent(0));                                                     \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
    }                                                                                           \
  } else {                                                                                      \
    if (compute_owned) {                                                                        \
      const Kokkos::RangePolicy<execution_space, OverlapTag<0, B, false>>                       \
          policy(0, rowidx2part.extent(0));                                                     \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
    } else {                                                                                    \
      const Kokkos::RangePolicy<execution_space, OverlapTag<1, B, false>>                       \
          policy(0, rowidx2part.extent(0));                                                     \
      Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
    }                                                                                           \
  }                                                                                             \
  break
      switch (blocksize_requested) {
        case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
        case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
        case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
        case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
      }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
    }
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};

}  // namespace BlockHelperDetails

}  // namespace Ifpack2

#endif