#ifndef IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"

namespace Ifpack2::BlockHelperDetails {
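
// Usage sketch for findDiagOffsets (below). Illustrative only: the offsets
// view type and the local-graph view names are assumptions chosen for the
// example, not fixed by this header.
//
//   using offsets_view = Kokkos::View<size_t*, memory_space>;
//   auto diag_offsets  = findDiagOffsets<execution_space, offsets_view>(
//       local_graph.row_map, local_graph.entries, nrows, blocksize);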
// Returns, for each block row, the offset of its diagonal block within the
// BlockCrs values array. Throws if any row lacks a diagonal entry.
template <typename ExecSpace, typename DiagOffsets, typename Rowptrs,
          typename Entries>
DiagOffsets findDiagOffsets(const Rowptrs& rowptrs, const Entries& entries,
                            size_t nrows, int blocksize) {
  DiagOffsets diag_offsets(do_not_initialize_tag("btdm.diag_offsets"), nrows);
  int err = 0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<ExecSpace>(0, nrows),
      KOKKOS_LAMBDA(size_t row, int& err) {
        auto rowBegin = rowptrs(row);
        auto rowEnd   = rowptrs(row + 1);
        for (size_t j = rowBegin; j < rowEnd; j++) {
          if (size_t(entries(j)) == row) {
            diag_offsets(row) = j * blocksize * blocksize;
            return;
          }
        }
        err++;
      },
      err);
  TEUCHOS_TEST_FOR_EXCEPT_MSG(
      err, "Ifpack2 BTD: at least one row has no diagonal entry");
  return diag_offsets;
}

template <typename MatrixType, int B>
struct ComputeResidualAndSolve_1Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;

  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;

  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;

  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };

  // Input/output multivectors (set by run()).
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // Raw BlockCrs values of A.
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // Block size requested at construction (used when B == 0).
  const local_ordinal_type blocksize_requested;

  // Precomputed (A value offset, x offset) pairs for owned and remote columns.
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // Inverses of the diagonal blocks.
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // Per-row squared norm of the update (for convergence checks).
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

  ComputeResidualAndSolve_1Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
      : tpetra_values(amd.tpetra_values),
        blocksize_requested(blocksize_requested_),
        A_x_offsets(amd.A_x_offsets),
        A_x_offsets_remote(amd.A_x_offsets_remote),
        d_inv(d_inv_),
        W(W_),
        damping_factor(damping_factor_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);
    const local_ordinal_type num_local_rows = d_inv.extent(0);

    const impl_scalar_type* xx;
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Team-shared scratch for the residual and D^{-1}*residual of one block
    // row, plus per-thread scratch for the x block being multiplied.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Initialize: residual <- b, D^{-1}*residual <- 0.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i]      = b(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      // residual -= A * x, covering owned and remote columns in one pass.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Offsets at or beyond the cutoff refer to x_remote.
              int64_t remote_cutoff = blocksize * num_local_rows;
              if (x_offset >= remote_cutoff)
                xx = &x_remote(x_offset - remote_cutoff, col);
              else
                xx = &x(x_offset, col);

              Kokkos::parallel_for(
                  Kokkos::ThreadVectorRange(member, blocksize),
                  [&](const local_ordinal_type& i) { local_x[i] = xx[i]; });

              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0,
                                                        -val);
                                   });
            }
          });
      member.team_barrier();

      // D^{-1}*residual := d_inv(rowidx) * residual.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();

      // Damped update of y, accumulating the squared norm of the update.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            impl_scalar_type old_y    = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }

  // Launches the single-pass kernel over all owned block rows.
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunSinglePass",
        ComputeResidualAndSolve0, execution_space);

    b        = b_;
    x        = x_;
    x_remote = x_remote_;
    y        = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    // Team scratch holds local_residual and local_Dinv_residual; thread
    // scratch holds local_x.
    const size_t shmem_team_size   = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(nrows, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass",
                         policy, *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
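
// Usage sketch for the single-pass variant (illustrative; 'amd', 'd_inv', 'W',
// the multivector views, and the damping factor are assumed to be prepared by
// the caller, e.g. the BlockTriDi container):
//
//   ComputeResidualAndSolve_1Pass<MatrixType, 0> functor(
//       amd, d_inv, W, blocksize, damping_factor);
//   functor.run(b, x_owned, x_remote, y);  // y = x + damping * D^{-1}(b - A*x)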

template <typename MatrixType, int B>
struct ComputeResidualAndSolve_2Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;

  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;

  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;

  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };

  // Tags dispatching the two phases of the residual computation.
  struct OwnedTag {};
  struct NonownedTag {};

  // Input/output multivectors (set by run_pass1()/run_pass2()).
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // Raw BlockCrs values of A.
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // Block size requested at construction (used when B == 0).
  const local_ordinal_type blocksize_requested;

  // Precomputed (A value offset, x offset) pairs for owned and remote columns.
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // Inverses of the diagonal blocks.
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // Per-row squared norm of the update (for convergence checks).
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

  ComputeResidualAndSolve_2Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
      : tpetra_values(amd.tpetra_values),
        blocksize_requested(blocksize_requested_),
        A_x_offsets(amd.A_x_offsets),
        A_x_offsets_remote(amd.A_x_offsets_remote),
        d_inv(d_inv_),
        W(W_),
        damping_factor(damping_factor_) {}

  // Pass 1: y := b - A_owned * x (partial residual over the locally owned
  // columns; the diagonal blocks are handled separately via d_inv).
  KOKKOS_INLINE_FUNCTION
  void operator()(const OwnedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Team-shared scratch for the partial residual; per-thread scratch for x.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Initialize: residual <- b.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_residual[i] = b(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      // residual -= A_owned * x.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);

              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x(x_offset + i, col);
                                   });

              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0,
                                                        -val);
                                   });
            }
          });
      member.team_barrier();

      // Write the partial residual to y (pass 2 finishes it).
      if (member.team_rank() == 0) {
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                             [&](const local_ordinal_type& k) {
                               y(row + k, col) = local_residual[k];
                             });
      }
    }
  }

  // Pass 2: finish the residual with the nonowned (remote) columns, apply the
  // inverse diagonal blocks, and perform the damped update of y.
  KOKKOS_INLINE_FUNCTION
  void operator()(const NonownedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Team-shared scratch for the residual and D^{-1}*residual; per-thread
    // scratch for the x block being multiplied.
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // Initialize: residual <- partial residual from pass 1 (held in y).
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i]      = y(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets_remote.extent(2);

      // residual -= A_remote * x_remote.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets_remote(rowidx, 0, k);
            int64_t x_offset = A_x_offsets_remote(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);

              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x_remote(x_offset + i, col);
                                   });

              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0,
                                                        -val);
                                   });
            }
          });
      member.team_barrier();

      // D^{-1}*residual := d_inv(rowidx) * residual.
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();

      // Damped update of y, accumulating the squared norm of the update.
      // The previous iterate is read from x, since y holds the residual here.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            impl_scalar_type old_y    = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }

  // Launches pass 1 over all owned block rows.
  void run_pass1(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass1",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    x = x_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size   = blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, OwnedTag> policy(nrows, team_size,
                                                         vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  // Launches pass 2 over all owned block rows, once x_remote has been imported.
  void run_pass2(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass2",
        ComputeResidualAndSolve0, execution_space);

    x        = x_;
    x_remote = x_remote_;
    y        = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size   = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, NonownedTag> policy(nrows, team_size,
                                                            vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
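
// Usage sketch for the two-pass variant (illustrative; the halo exchange that
// fills 'x_remote' between the two passes is assumed to be done by the caller
// and is not part of this header):
//
//   ComputeResidualAndSolve_2Pass<MatrixType, 0> functor(
//       amd, d_inv, W, blocksize, damping_factor);
//   functor.run_pass1(b, x_owned, y);        // y <- partial residual (owned)
//   // ... import/communicate x_remote ...
//   functor.run_pass2(x_owned, x_remote, y); // finish residual, apply D^{-1},
//                                            // damped update of y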

template <typename MatrixType, int B>
struct ComputeResidualAndSolve_SolveOnly {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;

  using execution_space = typename impl_type::execution_space;
  using memory_space    = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type   = typename impl_type::impl_scalar_type;
  using magnitude_type     = typename impl_type::magnitude_type;

  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;

  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;

  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;

  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view              = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  enum : int { max_blocksize = 32 };

  // Input/output multivectors (set by run()).
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // Raw BlockCrs values of A.
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // Block size requested at construction (used when B == 0).
  const local_ordinal_type blocksize_requested;

  // Precomputed (A value offset, x offset) pairs for owned and remote columns.
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // Inverses of the diagonal blocks.
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // Per-row squared norm of the update (for convergence checks).
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

  ComputeResidualAndSolve_SolveOnly(
      const AmD<MatrixType>& amd, const btdm_scalar_type_3d_view& d_inv_,
      const impl_scalar_type_1d_view& W_,
      const local_ordinal_type& blocksize_requested_,
      const impl_scalar_type& damping_factor_)
      : tpetra_values(amd.tpetra_values),
        blocksize_requested(blocksize_requested_),
        A_x_offsets(amd.A_x_offsets),
        A_x_offsets_remote(amd.A_x_offsets_remote),
        d_inv(d_inv_),
        W(W_),
        damping_factor(damping_factor_) {}

  // Applies y = damping_factor * D^{-1} * b row by row (used when the current
  // solution is zero, so the residual is just b). One thread handles one block
  // row.
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx =
        member.league_rank() * member.team_size() + member.team_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    // Per-thread scratch for D^{-1} * b of one block row.
    impl_scalar_type* local_Dinv_residual =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return;

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // D^{-1} * b for this block row.
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                           [&](const local_ordinal_type& k0) {
                             impl_scalar_type val = 0;
                             for (local_ordinal_type k1 = 0; k1 < blocksize;
                                  k1++)
                               val += d_inv(rowidx, k0, k1) * b(row + k1, col);
                             local_Dinv_residual[k0] = val;
                           });

      // Damped update of y, accumulating the squared norm of the update.
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            impl_scalar_type y_update = local_Dinv_residual[k];
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; });
  }

  // Launches the solve-only kernel; each team handles team_size block rows.
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows     = d_inv.extent(0);

    const local_ordinal_type team_size   = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(
        (nrows + team_size - 1) / team_size, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};
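
// Usage sketch for the solve-only variant (illustrative; intended for the
// first sweep, where the current solution is zero and the residual equals b):
//
//   ComputeResidualAndSolve_SolveOnly<MatrixType, 0> functor(
//       amd, d_inv, W, blocksize, damping_factor);
//   functor.run(b, y);  // y = damping_factor * D^{-1} * b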

}  // namespace Ifpack2::BlockHelperDetails

#endif