#ifndef IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"

namespace Ifpack2::BlockHelperDetails {
template <typename ExecSpace, typename DiagOffsets, typename Rowptrs,
          typename Entries>
DiagOffsets findDiagOffsets(const Rowptrs& rowptrs, const Entries& entries,
                            size_t nrows, int blocksize) {
  DiagOffsets diag_offsets(do_not_initialize_tag("btdm.diag_offsets"), nrows);
  int err1 = 0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<ExecSpace>(0, nrows),
      KOKKOS_LAMBDA(size_t row, int& err2) {
        auto rowBegin = rowptrs(row);
        auto rowEnd   = rowptrs(row + 1);
        for (size_t j = rowBegin; j < rowEnd; j++) {
          if (size_t(entries(j)) == row) {
            // Record the flat offset of this row's diagonal block.
            diag_offsets(row) = j * blocksize * blocksize;
            return;
          }
        }
        // No diagonal entry was found for this row.
        err2++;
      },
      err1);
  TEUCHOS_TEST_FOR_EXCEPT_MSG(
      err1, "Ifpack2 BTD: at least one row has no diagonal entry");
  return diag_offsets;
}
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_1Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  // Views.
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  // Team policy member type.
  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // Raw block values of A.
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
  // Block size (runtime value; B is the compile-time hint).
  const local_ordinal_type blocksize_requested;
  // Offsets into tpetra_values and x for each entry of each block row.
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
  // Inverses of the diagonal blocks.
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
  // Per-block-row squared norms of the update.
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;
  ComputeResidualAndSolve_1Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
      : tpetra_values(amd.tpetra_values)
      , blocksize_requested(blocksize_requested_)
      , A_x_offsets(amd.A_x_offsets)
      , A_x_offsets_remote(amd.A_x_offsets_remote)
      , d_inv(d_inv_)
      , W(W_)
      , damping_factor(damping_factor_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx    = member.league_rank();
    const local_ordinal_type row       = rowidx * blocksize;
    const local_ordinal_type num_vectors    = b.extent(1);
    const local_ordinal_type num_local_rows = d_inv.extent(0);

    const impl_scalar_type* xx;
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Team scratch: the current block of the residual and of d_inv * r.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    // Thread scratch: a cached block of x for the matrix-vector products.
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Initialize the accumulation arrays for this column.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i]      = b(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      // r -= A * x, one block entry per thread.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
#if KOKKOS_VERSION >= 40799
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
#else
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
#endif
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull the x block into scratch; offsets past the owned rows
              // index into x_remote.
              int64_t remote_cutoff = blocksize * num_local_rows;
              if (x_offset >= remote_cutoff)
                xx = &x_remote(x_offset - remote_cutoff, col);
              else
                xx = &x(x_offset, col);

              Kokkos::parallel_for(
                  Kokkos::ThreadVectorRange(member, blocksize),
                  [&](const local_ordinal_type& i) { local_x[i] = xx[i]; });

              // r(k0) -= sum_k1 A(k0, k1) * x(k1)
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();

      // local_Dinv_residual = d_inv * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();

      // Apply the damped update to y and accumulate the squared update norm.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Change in y for this entry, before damping.
            impl_scalar_type old_y    = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
#if KOKKOS_VERSION >= 40799
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
#else
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
#endif
              magnitude_type ydiff =
#if KOKKOS_VERSION >= 40799
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
#else
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
#endif
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }
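
  // Launch the single-pass kernel: one team per block row. Team scratch holds
  // local_residual and local_Dinv_residual; thread scratch holds local_x.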
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunSinglePass",
        ComputeResidualAndSolve0, execution_space);

    b        = b_;
    x        = x_;
    x_remote = x_remote_;
    y        = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    // Team scratch: local_residual and local_Dinv_residual.
    const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
    // Thread scratch: local_x.
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(nrows, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass",
                         policy, *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
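
// ComputeResidualAndSolve_2Pass: the same update split into two kernels,
// presumably so that the owned work can proceed before x_remote is available.
// Pass 1 (OwnedTag) computes the partial residual b - A*x over owned columns
// and stores it in y. Pass 2 (NonownedTag) subtracts the remote contributions,
// applies d_inv, performs the damped update of y, and records the squared
// update norm in W.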
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_2Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  // Views.
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  // Team policy member type.
  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };

  // Tags selecting which pass operator() performs.
  struct OwnedTag {};
  struct NonownedTag {};
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // Raw block values of A.
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
  // Block size (runtime value; B is the compile-time hint).
  const local_ordinal_type blocksize_requested;
  // Offsets into tpetra_values and x for each entry of each block row.
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
  // Inverses of the diagonal blocks.
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
  // Per-block-row squared norms of the update.
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;
  ComputeResidualAndSolve_2Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
      : tpetra_values(amd.tpetra_values)
      , blocksize_requested(blocksize_requested_)
      , A_x_offsets(amd.A_x_offsets)
      , A_x_offsets_remote(amd.A_x_offsets_remote)
      , d_inv(d_inv_)
      , W(W_)
      , damping_factor(damping_factor_) {}
  // Pass 1: partial residual using only the owned part of x; the result is
  // stored in y.
  KOKKOS_INLINE_FUNCTION
  void operator()(const OwnedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx    = member.league_rank();
    const local_ordinal_type row       = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Team scratch: the current block of the residual.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    // Thread scratch: a cached block of x.
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Initialize the residual with b.
      Kokkos::parallel_for(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& i) { local_residual[i] = b(row + i, col); });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      // r -= A * x over owned columns, one block entry per thread.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
#if KOKKOS_VERSION >= 40799
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
#else
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
#endif
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull the x block into scratch.
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x(x_offset + i, col);
                                   });
              // r(k0) -= sum_k1 A(k0, k1) * x(k1)
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();

      // Store the partial residual into y; pass 2 finishes the update.
      if (member.team_rank() == 0) {
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                             [&](const local_ordinal_type& k) {
                               y(row + k, col) = local_residual[k];
                             });
      }
    }
  }
  // Pass 2: finish the residual with the remote part of x, apply d_inv, and
  // perform the damped update of y.
  KOKKOS_INLINE_FUNCTION
  void operator()(const NonownedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx    = member.league_rank();
    const local_ordinal_type row       = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        tpetra_values.data(), blocksize, blocksize);

    // Team scratch: the current block of the residual and of d_inv * r.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    // Thread scratch: a cached block of x_remote.
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Start from the partial residual produced by pass 1 (stored in y).
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i]      = y(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets_remote.extent(2);

      // r -= A * x_remote over nonowned columns, one block entry per thread.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets_remote(rowidx, 0, k);
            int64_t x_offset = A_x_offsets_remote(rowidx, 1, k);
#if KOKKOS_VERSION >= 40799
            if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
#else
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
#endif
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull the x_remote block into scratch.
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x_remote(x_offset + i, col);
                                   });
              // r(k0) -= sum_k1 A(k0, k1) * x(k1)
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();

      // local_Dinv_residual = d_inv * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();

      // Apply the damped update to y and accumulate the squared update norm.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Change in y for this entry, before damping.
            impl_scalar_type old_y    = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
#if KOKKOS_VERSION >= 40799
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
#else
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
#endif
              magnitude_type ydiff =
#if KOKKOS_VERSION >= 40799
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
#else
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
#endif
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }
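
  // Launch pass 1 (owned): one team per block row. Team scratch holds
  // local_residual; thread scratch holds local_x.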
  void run_pass1(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass1",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    x = x_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    // Team scratch: local_residual. Thread scratch: local_x.
    const size_t shmem_team_size   = blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, OwnedTag> policy(nrows, team_size,
                                                         vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
  // Launch pass 2 (nonowned): finishes the residual with x_remote, applies
  // d_inv, and performs the damped update of y.
  void run_pass2(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass2",
        ComputeResidualAndSolve0, execution_space);

    x        = x_;
    x_remote = x_remote_;
    y        = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    // Team scratch: local_residual and local_Dinv_residual. Thread scratch: local_x.
    const size_t shmem_team_size   = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, NonownedTag> policy(nrows, team_size,
                                                            vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
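
// ComputeResidualAndSolve_SolveOnly: variant for the case where the incoming
// y is zero (see the "Run_Y_Zero" timer below), so the residual is simply b
// and each block row only needs y = damping_factor * (d_inv * b). The squared
// update norm per block row is still written to W.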
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_SolveOnly {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  // Views.
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  // Team policy member type.
  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // Raw block values of A.
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
  // Block size (runtime value; B is the compile-time hint).
  const local_ordinal_type blocksize_requested;
  // Offsets into tpetra_values and x for each entry of each block row.
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
  // Inverses of the diagonal blocks.
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
  // Per-block-row squared norms of the update.
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;
  ComputeResidualAndSolve_SolveOnly(const AmD<MatrixType>& amd,
                                    const btdm_scalar_type_3d_view& d_inv_,
                                    const impl_scalar_type_1d_view& W_,
                                    const local_ordinal_type& blocksize_requested_,
                                    const impl_scalar_type& damping_factor_)
      : tpetra_values(amd.tpetra_values)
      , blocksize_requested(blocksize_requested_)
      , A_x_offsets(amd.A_x_offsets)
      , A_x_offsets_remote(amd.A_x_offsets_remote)
      , d_inv(d_inv_)
      , W(W_)
      , damping_factor(damping_factor_) {}
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    // Each thread of the team handles one block row.
    const local_ordinal_type rowidx =
        member.league_rank() * member.team_size() + member.team_rank();
    const local_ordinal_type row         = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    // Thread scratch: the current block of d_inv * b.
    impl_scalar_type* local_Dinv_residual =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return;

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // local_Dinv_residual = d_inv * b (the residual equals b since y starts at zero).
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                           [&](const local_ordinal_type& k0) {
                             impl_scalar_type val = 0;
                             for (local_ordinal_type k1 = 0; k1 < blocksize;
                                  k1++)
                               val += d_inv(rowidx, k0, k1) * b(row + k1, col);
                             local_Dinv_residual[k0] = val;
                           });

      // Apply the damped update to y and accumulate the squared update norm.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            impl_scalar_type y_update = local_Dinv_residual[k];
#if KOKKOS_VERSION >= 40799
            if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
#else
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
#endif
              magnitude_type ydiff =
#if KOKKOS_VERSION >= 40799
                  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
#else
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
#endif
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; });
  }
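
  // Launch the solve-only kernel: block rows are assigned one per thread
  // (team_size rows per team), so only thread scratch is needed.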
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(
        (nrows + team_size - 1) / team_size, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};

}  // namespace Ifpack2::BlockHelperDetails

#endif