Ifpack2 Templated Preconditioning Package Version 1.0
Ifpack2_BlockComputeResidualAndSolve.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP
11 #define IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP
12 
13 #include "Ifpack2_BlockHelper.hpp"
14 #include "Ifpack2_BlockComputeResidualVector.hpp"
15 
16 namespace Ifpack2::BlockHelperDetails {
17 
18 template <typename ExecSpace, typename DiagOffsets, typename Rowptrs,
19  typename Entries>
20 DiagOffsets findDiagOffsets(const Rowptrs& rowptrs, const Entries& entries,
21  size_t nrows, int blocksize) {
22  DiagOffsets diag_offsets(do_not_initialize_tag("btdm.diag_offsets"), nrows);
23  int err1 = 0;
24  Kokkos::parallel_reduce(
25  Kokkos::RangePolicy<ExecSpace>(0, nrows),
26  KOKKOS_LAMBDA(size_t row, int& err2) {
27  auto rowBegin = rowptrs(row);
28  auto rowEnd = rowptrs(row + 1);
29  for (size_t j = rowBegin; j < rowEnd; j++) {
30  if (size_t(entries(j)) == row) {
31  diag_offsets(row) = j * blocksize * blocksize;
32  return;
33  }
34  }
35  err2++;
36  },
37  err1);
38  TEUCHOS_TEST_FOR_EXCEPT_MSG(
39  err1, "Ifpack2 BTD: at least one row has no diagonal entry");
40  return diag_offsets;
41 }
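// The returned view maps each block row to the offset (in scalar entries) of
// its diagonal block inside the flat BlockCrs values array, i.e.
// diag_offsets(row) = j * blocksize^2 where j is the position of the diagonal
// entry in that row. A hypothetical call site (view types assumed to come from
// ImplType<MatrixType>; names illustrative only) would look like:
//
//   auto diag_offsets =
//       findDiagOffsets<execution_space, size_type_1d_view>(
//           A_rowptrs, A_entries, num_block_rows, blocksize);
//   // diag_offsets(i) now indexes the first scalar of row i's diagonal block
//   // within the values array.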
42 
43 template <typename MatrixType, int B>
44 struct ComputeResidualAndSolve_1Pass {
45  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
46  using node_device_type = typename impl_type::node_device_type;
47  using execution_space = typename impl_type::execution_space;
48  using memory_space = typename impl_type::memory_space;
49 
50  using local_ordinal_type = typename impl_type::local_ordinal_type;
51  using size_type = typename impl_type::size_type;
52  using impl_scalar_type = typename impl_type::impl_scalar_type;
53  using magnitude_type = typename impl_type::magnitude_type;
55  using local_ordinal_type_1d_view =
56  typename impl_type::local_ordinal_type_1d_view;
57  using size_type_1d_view = typename impl_type::size_type_1d_view;
58  using tpetra_block_access_view_type =
59  typename impl_type::tpetra_block_access_view_type; // block crs (layout
60  // right)
61  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
62  using impl_scalar_type_2d_view_tpetra =
63  typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector
64  // (layout left)
65  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
66  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
67  using i64_3d_view = typename impl_type::i64_3d_view;
68 
70  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
71 
72  // enum for max blocksize
73  enum : int { max_blocksize = 32 };
74 
75  private:
76  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
77  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
78  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
79  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
80 
81  // AmD information
82  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
83 
84  // blocksize
85  const local_ordinal_type blocksize_requested;
86 
87  // block offsets
88  const ConstUnmanaged<i64_3d_view> A_x_offsets;
89  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
90 
91  // diagonal block inverses
92  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
93 
94  // squared update norms
95  const Unmanaged<impl_scalar_type_1d_view> W;
96 
97  impl_scalar_type damping_factor;
98 
99  public:
100  ComputeResidualAndSolve_1Pass(const AmD<MatrixType>& amd,
101  const btdm_scalar_type_3d_view& d_inv_,
102  const impl_scalar_type_1d_view& W_,
103  const local_ordinal_type& blocksize_requested_,
104  const impl_scalar_type& damping_factor_)
105  : tpetra_values(amd.tpetra_values)
106  , blocksize_requested(blocksize_requested_)
107  , A_x_offsets(amd.A_x_offsets)
108  , A_x_offsets_remote(amd.A_x_offsets_remote)
109  , d_inv(d_inv_)
110  , W(W_)
111  , damping_factor(damping_factor_) {}
112 
113  KOKKOS_INLINE_FUNCTION
114  void operator()(const member_type& member) const {
115  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
116  const local_ordinal_type rowidx = member.league_rank();
117  const local_ordinal_type row = rowidx * blocksize;
118  const local_ordinal_type num_vectors = b.extent(1);
119  const local_ordinal_type num_local_rows = d_inv.extent(0);
120 
121  const impl_scalar_type* xx;
122  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
123  tpetra_values.data(), blocksize, blocksize);
124 
125  // Get team/thread scratch for the local residual, D^-1 * residual, and a local copy of x
126  impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
127  member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
128  impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
129  member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
130  impl_scalar_type* local_x =
131  reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
132  blocksize * sizeof(impl_scalar_type)));
133 
134  magnitude_type norm = 0;
135  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
136  if (col) member.team_barrier();
137  // y -= Rx
138  // Initialize accumulation arrays
139  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
140  [&](const local_ordinal_type& i) {
141  local_Dinv_residual[i] = 0;
142  local_residual[i] = b(row + i, col);
143  });
144  member.team_barrier();
145 
146  int numEntries = A_x_offsets.extent(2);
147 
148  Kokkos::parallel_for(
149  Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
150  int64_t A_offset = A_x_offsets(rowidx, 0, k);
151  int64_t x_offset = A_x_offsets(rowidx, 1, k);
152 #if KOKKOS_VERSION >= 40799
153  if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
154 #else
155  if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
156 #endif
157  A_block_cst.assign_data(tpetra_values.data() + A_offset);
158  // Pull x into local memory
159  int64_t remote_cutoff = blocksize * num_local_rows;
160  if (x_offset >= remote_cutoff)
161  xx = &x_remote(x_offset - remote_cutoff, col);
162  else
163  xx = &x(x_offset, col);
164 
165  Kokkos::parallel_for(
166  Kokkos::ThreadVectorRange(member, blocksize),
167  [&](const local_ordinal_type& i) { local_x[i] = xx[i]; });
168 
169  // matvec on block: local_residual -= A_block_cst * local_x
170  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
171  [&](const int k0) {
172  impl_scalar_type val = 0;
173  for (int k1 = 0; k1 < blocksize; k1++)
174  val += A_block_cst(k0, k1) * local_x[k1];
175  Kokkos::atomic_add(local_residual + k0, -val);
176  });
177  }
178  });
179  member.team_barrier();
180  // Compute local_Dinv_residual = D^-1 * local_residual
181  Kokkos::parallel_for(
182  Kokkos::TeamThreadRange(member, blocksize),
183  [&](const local_ordinal_type& k0) {
184  Kokkos::parallel_reduce(
185  Kokkos::ThreadVectorRange(member, blocksize),
186  [&](const local_ordinal_type& k1, impl_scalar_type& update) {
187  update += d_inv(rowidx, k0, k1) * local_residual[k1];
188  },
189  local_Dinv_residual[k0]);
190  });
191  member.team_barrier();
192  // local_Dinv_residual is fully computed. Now compute the
193  // squared y update norm and update y (using damping factor).
194  magnitude_type colNorm;
195  Kokkos::parallel_reduce(
196  Kokkos::TeamVectorRange(member, blocksize),
197  [&](const local_ordinal_type& k, magnitude_type& update) {
198  // Compute the change in y (assuming damping_factor == 1) for this
199  // entry.
200  impl_scalar_type old_y = x(row + k, col);
201  impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
202 #if KOKKOS_VERSION >= 40799
203  if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
204 #else
205  if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
206 #endif
207  magnitude_type ydiff =
208 #if KOKKOS_VERSION >= 40799
209  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
210 #else
211  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
212 #endif
213  update += ydiff * ydiff;
214  } else {
215  update += y_update * y_update;
216  }
217  y(row + k, col) = old_y + damping_factor * y_update;
218  },
219  colNorm);
220  norm += colNorm;
221  }
222  Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
223  }
224 
225  // Launch SinglePass version (owned + nonowned residual, plus Dinv in a single
226  // kernel)
227  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
228  const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
229  const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
230  const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
231  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
232  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
233  "BlockTriDi::ComputeResidualAndSolve::RunSinglePass",
234  ComputeResidualAndSolve0, execution_space);
235 
236  y = y_;
237  b = b_;
238  x = x_;
239  x_remote = x_remote_;
240 
241  const local_ordinal_type blocksize = blocksize_requested;
242  const local_ordinal_type nrows = d_inv.extent(0);
243 
244  const local_ordinal_type team_size = 8;
245  const local_ordinal_type vector_size = 8;
246  // team: local_residual, local_Dinv_residual
247  const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
248  // thread: local_x
249  const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
250  Kokkos::TeamPolicy<execution_space> policy(nrows, team_size, vector_size);
251  policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
252  Kokkos::PerThread(shmem_thread_size));
253  Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass",
254  policy, *this);
255  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
256  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
257  }
258 };
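// Taken together, operator() and run() above perform one damped block-Jacobi
// sweep per block row in a single kernel launch. With R denoting the blocks
// stored in AmD (read here as A with its block diagonal D removed) and omega
// the damping factor, each block row i of each multivector column is updated
// as
//
//   r_i = b_i - sum_j R_ij * x_j        (x_j read from x or x_remote)
//   y_i = x_i + omega * (D_ii^{-1} r_i - x_i)
//
// while W(i) receives the squared norm of the undamped update
// D_ii^{-1} r_i - x_i, summed over the multivector columns.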
259 
260 template <typename MatrixType, int B>
261 struct ComputeResidualAndSolve_2Pass {
262  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
263  using node_device_type = typename impl_type::node_device_type;
264  using execution_space = typename impl_type::execution_space;
265  using memory_space = typename impl_type::memory_space;
266 
267  using local_ordinal_type = typename impl_type::local_ordinal_type;
268  using size_type = typename impl_type::size_type;
269  using impl_scalar_type = typename impl_type::impl_scalar_type;
270  using magnitude_type = typename impl_type::magnitude_type;
272  using local_ordinal_type_1d_view =
273  typename impl_type::local_ordinal_type_1d_view;
274  using size_type_1d_view = typename impl_type::size_type_1d_view;
275  using tpetra_block_access_view_type =
276  typename impl_type::tpetra_block_access_view_type; // block crs (layout
277  // right)
278  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
279  using impl_scalar_type_2d_view_tpetra =
280  typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector
281  // (layout left)
282  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
283  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
284  using i64_3d_view = typename impl_type::i64_3d_view;
285 
287  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
288 
289  // enum for max blocksize
290  enum : int { max_blocksize = 32 };
291 
292  // Tag for computing residual with owned columns only (pass 1)
293  struct OwnedTag {};
294 
295  // Tag for finishing the residual with nonowned columns, and solving/norming
296  // (pass 2)
297  struct NonownedTag {};
298 
299  private:
300  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
301  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
302  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
303  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
304 
305  // AmD information
306  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
307 
308  // blocksize
309  const local_ordinal_type blocksize_requested;
310 
311  // block offsets
312  const ConstUnmanaged<i64_3d_view> A_x_offsets;
313  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
314 
315  // diagonal block inverses
316  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
317 
318  // squared update norms
319  const Unmanaged<impl_scalar_type_1d_view> W;
320 
321  impl_scalar_type damping_factor;
322 
323  public:
324  ComputeResidualAndSolve_2Pass(const AmD<MatrixType>& amd,
325  const btdm_scalar_type_3d_view& d_inv_,
326  const impl_scalar_type_1d_view& W_,
327  const local_ordinal_type& blocksize_requested_,
328  const impl_scalar_type& damping_factor_)
329  : tpetra_values(amd.tpetra_values)
330  , blocksize_requested(blocksize_requested_)
331  , A_x_offsets(amd.A_x_offsets)
332  , A_x_offsets_remote(amd.A_x_offsets_remote)
333  , d_inv(d_inv_)
334  , W(W_)
335  , damping_factor(damping_factor_) {}
336 
337  KOKKOS_INLINE_FUNCTION
338  void operator()(const OwnedTag, const member_type& member) const {
339  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
340  const local_ordinal_type rowidx = member.league_rank();
341  const local_ordinal_type row = rowidx * blocksize;
342  const local_ordinal_type num_vectors = b.extent(1);
343 
344  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
345  tpetra_values.data(), blocksize, blocksize);
346 
347  // Get team/thread scratch for the local residual and a local copy of x
348  impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
349  member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
350  impl_scalar_type* local_x =
351  reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
352  blocksize * sizeof(impl_scalar_type)));
353 
354  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
355  if (col) member.team_barrier();
356  // y -= Rx
357  // Initialize accumulation arrays
358  Kokkos::parallel_for(
359  Kokkos::TeamVectorRange(member, blocksize),
360  [&](const local_ordinal_type& i) { local_residual[i] = b(row + i, col); });
361  member.team_barrier();
362 
363  int numEntries = A_x_offsets.extent(2);
364 
365  Kokkos::parallel_for(
366  Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
367  int64_t A_offset = A_x_offsets(rowidx, 0, k);
368  int64_t x_offset = A_x_offsets(rowidx, 1, k);
369 #if KOKKOS_VERSION >= 40799
370  if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
371 #else
372  if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
373 #endif
374  A_block_cst.assign_data(tpetra_values.data() + A_offset);
375  // Pull x into local memory
376  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
377  [&](const local_ordinal_type& i) {
378  local_x[i] = x(x_offset + i, col);
379  });
380 
381  // matvec on block: local_residual -= A_block_cst * local_x
382  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
383  [&](const local_ordinal_type& k0) {
384  impl_scalar_type val = 0;
385  for (int k1 = 0; k1 < blocksize; k1++)
386  val += A_block_cst(k0, k1) * local_x[k1];
387  Kokkos::atomic_add(local_residual + k0, -val);
388  });
389  }
390  });
391  member.team_barrier();
392  // Write back the partial residual to y
393  if (member.team_rank() == 0) {
394  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
395  [&](const local_ordinal_type& k) {
396  y(row + k, col) = local_residual[k];
397  });
398  }
399  }
400  }
401 
402  KOKKOS_INLINE_FUNCTION
403  void operator()(const NonownedTag, const member_type& member) const {
404  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
405  const local_ordinal_type rowidx = member.league_rank();
406  const local_ordinal_type row = rowidx * blocksize;
407  const local_ordinal_type num_vectors = b.extent(1);
408 
409  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
410  tpetra_values.data(), blocksize, blocksize);
411 
412  // Get team/thread scratch for the local residual, D^-1 * residual, and a local copy of x
413  impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
414  member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
415  impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
416  member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
417  impl_scalar_type* local_x =
418  reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
419  blocksize * sizeof(impl_scalar_type)));
420 
421  magnitude_type norm = 0;
422  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
423  if (col) member.team_barrier();
424  // y -= Rx
425  // Initialize accumulation arrays.
426  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
427  [&](const local_ordinal_type& i) {
428  local_Dinv_residual[i] = 0;
429  local_residual[i] = y(row + i, col);
430  });
431  member.team_barrier();
432 
433  int numEntries = A_x_offsets_remote.extent(2);
434 
435  Kokkos::parallel_for(
436  Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
437  int64_t A_offset = A_x_offsets_remote(rowidx, 0, k);
438  int64_t x_offset = A_x_offsets_remote(rowidx, 1, k);
439 #if KOKKOS_VERSION >= 40799
440  if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
441 #else
442  if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
443 #endif
444  A_block_cst.assign_data(tpetra_values.data() + A_offset);
445  // Pull x into local memory
446  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
447  [&](const local_ordinal_type& i) {
448  local_x[i] = x_remote(x_offset + i, col);
449  });
450 
451  // matvec on block: local_residual -= A_block_cst * local_x
452  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
453  [&](const int k0) {
454  impl_scalar_type val = 0;
455  for (int k1 = 0; k1 < blocksize; k1++)
456  val += A_block_cst(k0, k1) * local_x[k1];
457  Kokkos::atomic_add(local_residual + k0, -val);
458  });
459  }
460  });
461  member.team_barrier();
462  // Compute local_Dinv_residual = D^-1 * local_residual
463  Kokkos::parallel_for(
464  Kokkos::TeamThreadRange(member, blocksize),
465  [&](const local_ordinal_type& k0) {
466  Kokkos::parallel_reduce(
467  Kokkos::ThreadVectorRange(member, blocksize),
468  [&](const local_ordinal_type& k1, impl_scalar_type& update) {
469  update += d_inv(rowidx, k0, k1) * local_residual[k1];
470  },
471  local_Dinv_residual[k0]);
472  });
473  member.team_barrier();
474  // local_Dinv_residual is fully computed. Now compute the
475  // squared y update norm and update y (using damping factor).
476  magnitude_type colNorm;
477  Kokkos::parallel_reduce(
478  Kokkos::TeamVectorRange(member, blocksize),
479  [&](const local_ordinal_type& k, magnitude_type& update) {
480  // Compute the change in y (assuming damping_factor == 1) for this
481  // entry.
482  impl_scalar_type old_y = x(row + k, col);
483  impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
484 #if KOKKOS_VERSION >= 40799
485  if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
486 #else
487  if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
488 #endif
489  magnitude_type ydiff =
490 #if KOKKOS_VERSION >= 40799
491  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
492 #else
493  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
494 #endif
495  update += ydiff * ydiff;
496  } else {
497  update += y_update * y_update;
498  }
499  y(row + k, col) = old_y + damping_factor * y_update;
500  },
501  colNorm);
502  norm += colNorm;
503  }
504  Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
505  }
506 
507  // Launch pass 1 of the 2-pass version.
508  // This computes just the owned part of residual and writes that back to y.
509  void run_pass1(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
510  const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
511  const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
512  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
513  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
514  "BlockTriDi::ComputeResidualAndSolve::RunPass1",
515  ComputeResidualAndSolve0, execution_space);
516 
517  b = b_;
518  x = x_;
519  y = y_;
520 
521  const local_ordinal_type blocksize = blocksize_requested;
522  const local_ordinal_type nrows = d_inv.extent(0);
523 
524  const local_ordinal_type team_size = 8;
525  const local_ordinal_type vector_size = 8;
526  const size_t shmem_team_size = blocksize * sizeof(impl_scalar_type);
527  const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
528  Kokkos::TeamPolicy<execution_space, OwnedTag> policy(nrows, team_size,
529  vector_size);
530  policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
531  Kokkos::PerThread(shmem_thread_size));
532  Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy,
533  *this);
534  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
535  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
536  }
537 
538  // Launch pass 2 of the 2-pass version.
539  // This finishes computing residual with x_remote,
540  // and then applies Dinv and computes norm.
541  void run_pass2(
542  const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
543  const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
544  const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
545  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
546  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
547  "BlockTriDi::ComputeResidualAndSolve::RunPass2",
548  ComputeResidualAndSolve0, execution_space);
549 
550  x = x_;
551  x_remote = x_remote_;
552  y = y_;
553 
554  const local_ordinal_type blocksize = blocksize_requested;
555  const local_ordinal_type nrows = d_inv.extent(0);
556 
557  const local_ordinal_type team_size = 8;
558  const local_ordinal_type vector_size = 8;
559  const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
560  const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
561  Kokkos::TeamPolicy<execution_space, NonownedTag> policy(nrows, team_size,
562  vector_size);
563  policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
564  Kokkos::PerThread(shmem_thread_size));
565  Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy,
566  *this);
567  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
568  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
569  }
570 };
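// The two-pass variant splits the same update as ComputeResidualAndSolve_1Pass
// across run_pass1 (OwnedTag) and run_pass2 (NonownedTag). Pass 1 accumulates
// b - R*x over the owned columns only and stashes that partial residual in y,
// which allows it to run before the remote (halo) entries of x are available;
// pass 2 then subtracts the R*x_remote contributions, applies D^{-1}, damps
// the update, writes the final y, and fills W with the squared update norms.
// The end result is intended to match the single-pass functor.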
571 
572 template <typename MatrixType, int B>
573 struct ComputeResidualAndSolve_SolveOnly {
574  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
575  using node_device_type = typename impl_type::node_device_type;
576  using execution_space = typename impl_type::execution_space;
577  using memory_space = typename impl_type::memory_space;
578 
579  using local_ordinal_type = typename impl_type::local_ordinal_type;
580  using size_type = typename impl_type::size_type;
581  using impl_scalar_type = typename impl_type::impl_scalar_type;
582  using magnitude_type = typename impl_type::magnitude_type;
584  using local_ordinal_type_1d_view =
585  typename impl_type::local_ordinal_type_1d_view;
586  using size_type_1d_view = typename impl_type::size_type_1d_view;
587  using tpetra_block_access_view_type =
588  typename impl_type::tpetra_block_access_view_type; // block crs (layout
589  // right)
590  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
591  using impl_scalar_type_2d_view_tpetra =
592  typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector
593  // (layout left)
594  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
595  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
596  using i64_3d_view = typename impl_type::i64_3d_view;
597 
599  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
600 
601  // enum for max blocksize
602  enum : int { max_blocksize = 32 };
603 
604  private:
605  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
606  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
607  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
608  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
609 
610  // AmD information
611  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
612 
613  // blocksize
614  const local_ordinal_type blocksize_requested;
615 
616  // block offsets
617  const ConstUnmanaged<i64_3d_view> A_x_offsets;
618  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
619 
620  // diagonal block inverses
621  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;
622 
623  // squared update norms
624  const Unmanaged<impl_scalar_type_1d_view> W;
625 
626  impl_scalar_type damping_factor;
627 
628  public:
629  ComputeResidualAndSolve_SolveOnly(
630  const AmD<MatrixType>& amd, const btdm_scalar_type_3d_view& d_inv_,
631  const impl_scalar_type_1d_view& W_,
632  const local_ordinal_type& blocksize_requested_,
633  const impl_scalar_type& damping_factor_)
634  : tpetra_values(amd.tpetra_values)
635  , blocksize_requested(blocksize_requested_)
636  , A_x_offsets(amd.A_x_offsets)
637  , A_x_offsets_remote(amd.A_x_offsets_remote)
638  , d_inv(d_inv_)
639  , W(W_)
640  , damping_factor(damping_factor_) {}
641 
642  KOKKOS_INLINE_FUNCTION
643  void operator()(const member_type& member) const {
644  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
645  const local_ordinal_type rowidx =
646  member.league_rank() * member.team_size() + member.team_rank();
647  const local_ordinal_type row = rowidx * blocksize;
648  const local_ordinal_type num_vectors = b.extent(1);
649 
650  // Get thread scratch for the local D^-1 * b values for this block row
651  impl_scalar_type* local_Dinv_residual =
652  reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
653  blocksize * sizeof(impl_scalar_type)));
654 
655  if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return;
656 
657  magnitude_type norm = 0;
658  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
659  // Compute local_Dinv_residual = D^-1 * local_residual
660  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
661  [&](const local_ordinal_type& k0) {
662  impl_scalar_type val = 0;
663  for (local_ordinal_type k1 = 0; k1 < blocksize;
664  k1++) {
665  val += d_inv(rowidx, k0, k1) * b(row + k1, col);
666  }
667  local_Dinv_residual[k0] = val;
668  });
669 
670  magnitude_type colNorm;
671  Kokkos::parallel_reduce(
672  Kokkos::ThreadVectorRange(member, blocksize),
673  [&](const local_ordinal_type& k, magnitude_type& update) {
674  // Compute the change in y (assuming damping_factor == 1) for this
675  // entry.
676  impl_scalar_type y_update = local_Dinv_residual[k];
677 #if KOKKOS_VERSION >= 40799
678  if constexpr (KokkosKernels::ArithTraits<impl_scalar_type>::is_complex) {
679 #else
680  if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
681 #endif
682  magnitude_type ydiff =
683 #if KOKKOS_VERSION >= 40799
684  KokkosKernels::ArithTraits<impl_scalar_type>::abs(y_update);
685 #else
686  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
687 #endif
688  update += ydiff * ydiff;
689  } else {
690  update += y_update * y_update;
691  }
692  y(row + k, col) = damping_factor * y_update;
693  },
694  colNorm);
695  norm += colNorm;
696  }
697  Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; });
698  }
699 
700  // ComputeResidualAndSolve_SolveOnly::run does the solve for the first
701  // iteration, when the initial guess for y is zero. This means the residual
702  // vector is just b. The kernel applies the inverse diags to b to find y, and
703  // also puts the partial squared update norms (1 per row) into W.
704  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
705  const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
706  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
707  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
708  "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero",
709  ComputeResidualAndSolve0, execution_space);
710 
711  y = y_;
712  b = b_;
713 
714  const local_ordinal_type blocksize = blocksize_requested;
715  const local_ordinal_type nrows = d_inv.extent(0);
716 
717  const local_ordinal_type team_size = 8;
718  const local_ordinal_type vector_size = 8;
719  const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
720  Kokkos::TeamPolicy<execution_space> policy(
721  (nrows + team_size - 1) / team_size, team_size, vector_size);
722  policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size));
723  Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy,
724  *this);
725  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
726  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
727  }
728 };
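// ComputeResidualAndSolve_SolveOnly covers the special case of the first
// sweep with a zero initial guess: the residual is simply b, so each block
// row computes y_i = omega * D_ii^{-1} b_i and W(i) accumulates
// ||D_ii^{-1} b_i||^2 over the multivector columns. Unlike the other
// functors, each team handles team_size block rows (one per thread), so the
// launch uses ceil(nrows / team_size) teams and only per-thread scratch.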
729 
730 } // namespace Ifpack2::BlockHelperDetails
731 
732 #endif