Ifpack2_BlockComputeResidualAndSolve.hpp
// @HEADER
// *****************************************************************************
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
//
// Copyright 2009 NTESS and the Ifpack2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"
#include "Ifpack2_BlockComputeResidualVector.hpp"

namespace Ifpack2::BlockHelperDetails {

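// findDiagOffsets: for each block row of a block CRS matrix, find the diagonal
// block and record its offset (in scalar entries) into the packed values array.
// The parallel_reduce counts rows without a diagonal entry and throws if any
// are found.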
template <typename ExecSpace, typename DiagOffsets, typename Rowptrs,
          typename Entries>
DiagOffsets findDiagOffsets(const Rowptrs& rowptrs, const Entries& entries,
                            size_t nrows, int blocksize) {
  DiagOffsets diag_offsets(do_not_initialize_tag("btdm.diag_offsets"), nrows);
  int err = 0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<ExecSpace>(0, nrows),
      KOKKOS_LAMBDA(size_t row, int& err) {
        auto rowBegin = rowptrs(row);
        auto rowEnd = rowptrs(row + 1);
        for (size_t j = rowBegin; j < rowEnd; j++) {
          if (size_t(entries(j)) == row) {
            diag_offsets(row) = j * blocksize * blocksize;
            return;
          }
        }
        err++;
      },
      err);
  TEUCHOS_TEST_FOR_EXCEPT_MSG(
      err, "Ifpack2 BTD: at least one row has no diagonal entry");
  return diag_offsets;
}

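// ComputeResidualAndSolve_1Pass: performs one damped block-Jacobi update in a
// single kernel launch. For each block row it computes the residual
// r = b - R*x, where R is A minus its block diagonal and x gathers both owned
// and remote (off-process) columns, applies the inverse diagonal blocks d_inv,
// and updates y <- x + damping_factor * (D^{-1} r - x). The squared norm of the
// undamped update for each block row (summed over all vectors) is written to W.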
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_1Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize and vector length
  enum : int { max_blocksize = 32 };

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_1Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values),
      blocksize_requested(blocksize_requested_),
      A_x_offsets(amd.A_x_offsets),
      A_x_offsets_remote(amd.A_x_offsets_remote),
      d_inv(d_inv_),
      W(W_),
      damping_factor(damping_factor_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);
    const local_ordinal_type num_local_rows = d_inv.extent(0);

    const impl_scalar_type* xx;
    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Get shared allocation for a local copy of x, residual, and A
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i] = b(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

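      // Each entry k of A_x_offsets holds a pair for this block row: (rowidx, 0, k)
      // is the offset of block k into tpetra_values, and (rowidx, 1, k) is the offset
      // of the matching block column into x. int64_t min() marks an unused slot, and
      // x offsets at or beyond blocksize * num_local_rows refer to x_remote.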
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              int64_t remote_cutoff = blocksize * num_local_rows;
              if (x_offset >= remote_cutoff)
                xx = &x_remote(x_offset - remote_cutoff, col);
              else
                xx = &x(x_offset, col);

              Kokkos::parallel_for(
                  Kokkos::ThreadVectorRange(member, blocksize),
                  [&](const local_ordinal_type& i) { local_x[i] = xx[i]; });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const int k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Compute local_Dinv_residual = D^-1 * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();
      // local_Dinv_residual is fully computed. Now compute the
      // squared y update norm and update y (using damping factor).
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type old_y = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }

  // Launch SinglePass version (owned + nonowned residual, plus Dinv in a single
  // kernel)
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
           const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunSinglePass",
        ComputeResidualAndSolve0, execution_space);

    y = y_;
    b = b_;
    x = x_;
    x_remote = x_remote_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    // team: local_residual, local_Dinv_residual
    const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
    // thread: local_x
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(nrows, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass",
                         policy, *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};

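// ComputeResidualAndSolve_2Pass: performs the same damped block-Jacobi update as
// the 1-pass functor, but split into two kernels. Pass 1 (OwnedTag) needs only b
// and the owned part of x, so it can run before x_remote is available; it stores
// the partial residual b - R_owned * x in y. Pass 2 (NonownedTag) subtracts the
// remote contribution, applies d_inv, performs the damped update of y, and
// writes the per-row squared update norms to W.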
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_2Pass {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize and vector length
  enum : int { max_blocksize = 32 };

  // Tag for computing residual with owned columns only (pass 1)
  struct OwnedTag {};

  // Tag for finishing the residual with nonowned columns, and solving/norming
  // (pass 2)
  struct NonownedTag {};

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_2Pass(const AmD<MatrixType>& amd,
                                const btdm_scalar_type_3d_view& d_inv_,
                                const impl_scalar_type_1d_view& W_,
                                const local_ordinal_type& blocksize_requested_,
                                const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values),
      blocksize_requested(blocksize_requested_),
      A_x_offsets(amd.A_x_offsets),
      A_x_offsets_remote(amd.A_x_offsets_remote),
      d_inv(d_inv_),
      W(W_),
      damping_factor(damping_factor_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const OwnedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Get shared allocation for a local copy of x, Ax, and A
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays
      Kokkos::parallel_for(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& i) { local_residual[i] = b(row + i, col); });
      member.team_barrier();

      int numEntries = A_x_offsets.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets(rowidx, 0, k);
            int64_t x_offset = A_x_offsets(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x(x_offset + i, col);
                                   });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Write back the partial residual to y
      if (member.team_rank() == 0) {
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                             [&](const local_ordinal_type& k) {
                               y(row + k, col) = local_residual[k];
                             });
      }
    }
  }

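  // Pass 2: read the partial residual written to y by pass 1, subtract the
  // remote (nonowned) contribution, then apply d_inv, update y with the damping
  // factor, and accumulate the squared update norms into W.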
  KOKKOS_INLINE_FUNCTION
  void operator()(const NonownedTag, const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
    const local_ordinal_type rowidx = member.league_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(
        NULL, blocksize, blocksize);

    // Get shared allocation for a local copy of x, Ax, and A
    impl_scalar_type* local_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_Dinv_residual = reinterpret_cast<impl_scalar_type*>(
        member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
    impl_scalar_type* local_x =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      if (col) member.team_barrier();
      // y -= Rx
      // Initialize accumulation arrays.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),
                           [&](const local_ordinal_type& i) {
                             local_Dinv_residual[i] = 0;
                             local_residual[i] = y(row + i, col);
                           });
      member.team_barrier();

      int numEntries = A_x_offsets_remote.extent(2);

      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) {
            int64_t A_offset = A_x_offsets_remote(rowidx, 0, k);
            int64_t x_offset = A_x_offsets_remote(rowidx, 1, k);
            if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
              A_block_cst.assign_data(tpetra_values.data() + A_offset);
              // Pull x into local memory
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const local_ordinal_type& i) {
                                     local_x[i] = x_remote(x_offset + i, col);
                                   });

              // matvec on block: local_residual -= A_block_cst * local_x
              Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                                   [&](const int k0) {
                                     impl_scalar_type val = 0;
                                     for (int k1 = 0; k1 < blocksize; k1++)
                                       val += A_block_cst(k0, k1) * local_x[k1];
                                     Kokkos::atomic_add(local_residual + k0, -val);
                                   });
            }
          });
      member.team_barrier();
      // Compute local_Dinv_residual = D^-1 * local_residual
      Kokkos::parallel_for(
          Kokkos::TeamThreadRange(member, blocksize),
          [&](const local_ordinal_type& k0) {
            Kokkos::parallel_reduce(
                Kokkos::ThreadVectorRange(member, blocksize),
                [&](const local_ordinal_type& k1, impl_scalar_type& update) {
                  update += d_inv(rowidx, k0, k1) * local_residual[k1];
                },
                local_Dinv_residual[k0]);
          });
      member.team_barrier();
      // local_Dinv_residual is fully computed. Now compute the
      // squared y update norm and update y (using damping factor).
      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::TeamVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type old_y = x(row + k, col);
            impl_scalar_type y_update = local_Dinv_residual[k] - old_y;
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = old_y + damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; });
  }

  // Launch pass 1 of the 2-pass version.
  // This computes just the owned part of residual and writes that back to y.
  void run_pass1(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
                 const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
                 const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass1",
        ComputeResidualAndSolve0, execution_space);

    b = b_;
    x = x_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size = blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, OwnedTag> policy(nrows, team_size,
                                                         vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }

  // Launch pass 2 of the 2-pass version.
  // This finishes computing residual with x_remote,
  // and then applies Dinv and computes norm.
  void run_pass2(
      const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_,
      const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& x_remote_,
      const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::RunPass2",
        ComputeResidualAndSolve0, execution_space);

    x = x_;
    x_remote = x_remote_;
    y = y_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type);
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space, NonownedTag> policy(nrows, team_size,
                                                            vector_size);
    policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size),
                            Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};

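// ComputeResidualAndSolve_SolveOnly: special case for the first sweep, when the
// initial guess for y is zero and the residual is simply b. Each thread handles
// one block row, computing y = damping_factor * D^{-1} * b and writing the
// per-row squared update norms to W.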
template <typename MatrixType, int B>
struct ComputeResidualAndSolve_SolveOnly {
  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
  using node_device_type = typename impl_type::node_device_type;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using local_ordinal_type_1d_view =
      typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using tpetra_block_access_view_type =
      typename impl_type::tpetra_block_access_view_type;  // block crs (layout right)
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view_tpetra =
      typename impl_type::impl_scalar_type_2d_view_tpetra;  // block multivector (layout left)
  using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view;
  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
  using i64_3d_view = typename impl_type::i64_3d_view;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

  // enum for max blocksize and vector length
  enum : int { max_blocksize = 32 };

 private:
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x;  // x_owned
  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
  Unmanaged<impl_scalar_type_2d_view_tpetra> y;

  // AmD information
  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

  // blocksize
  const local_ordinal_type blocksize_requested;

  // block offsets
  const ConstUnmanaged<i64_3d_view> A_x_offsets;
  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

  // diagonal block inverses
  const ConstUnmanaged<btdm_scalar_type_3d_view> d_inv;

  // squared update norms
  const Unmanaged<impl_scalar_type_1d_view> W;

  impl_scalar_type damping_factor;

 public:
  ComputeResidualAndSolve_SolveOnly(
      const AmD<MatrixType>& amd, const btdm_scalar_type_3d_view& d_inv_,
      const impl_scalar_type_1d_view& W_,
      const local_ordinal_type& blocksize_requested_,
      const impl_scalar_type& damping_factor_)
    : tpetra_values(amd.tpetra_values),
      blocksize_requested(blocksize_requested_),
      A_x_offsets(amd.A_x_offsets),
      A_x_offsets_remote(amd.A_x_offsets_remote),
      d_inv(d_inv_),
      W(W_),
      damping_factor(damping_factor_) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& member) const {
    const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
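    // Unlike the team-per-row functors above, each thread handles one block row,
    // so the row index combines the league and team ranks.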
    const local_ordinal_type rowidx =
        member.league_rank() * member.team_size() + member.team_rank();
    const local_ordinal_type row = rowidx * blocksize;
    const local_ordinal_type num_vectors = b.extent(1);

    // Get scratch allocation for local_Dinv_residual
    impl_scalar_type* local_Dinv_residual =
        reinterpret_cast<impl_scalar_type*>(member.thread_scratch(0).get_shmem(
            blocksize * sizeof(impl_scalar_type)));

    if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return;

    magnitude_type norm = 0;
    for (local_ordinal_type col = 0; col < num_vectors; ++col) {
      // Compute local_Dinv_residual = D^-1 * b (the residual, since y is zero)
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),
                           [&](const local_ordinal_type& k0) {
                             impl_scalar_type val = 0;
                             for (local_ordinal_type k1 = 0; k1 < blocksize; k1++) {
                               val += d_inv(rowidx, k0, k1) * b(row + k1, col);
                             }
                             local_Dinv_residual[k0] = val;
                           });

      magnitude_type colNorm;
      Kokkos::parallel_reduce(
          Kokkos::ThreadVectorRange(member, blocksize),
          [&](const local_ordinal_type& k, magnitude_type& update) {
            // Compute the change in y (assuming damping_factor == 1) for this
            // entry.
            impl_scalar_type y_update = local_Dinv_residual[k];
            if constexpr (Kokkos::ArithTraits<impl_scalar_type>::is_complex) {
              magnitude_type ydiff =
                  Kokkos::ArithTraits<impl_scalar_type>::abs(y_update);
              update += ydiff * ydiff;
            } else {
              update += y_update * y_update;
            }
            y(row + k, col) = damping_factor * y_update;
          },
          colNorm);
      norm += colNorm;
    }
    Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; });
  }

  // ComputeResidualAndSolve_SolveOnly::run does the solve for the first
  // iteration, when the initial guess for y is zero. This means the residual
  // vector is just b. The kernel applies the inverse diags to b to find y, and
  // also puts the partial squared update norms (1 per row) into W.
  void run(const ConstUnmanaged<impl_scalar_type_2d_view_tpetra>& b_,
           const Unmanaged<impl_scalar_type_2d_view_tpetra>& y_) {
    IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
    IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE(
        "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero",
        ComputeResidualAndSolve0, execution_space);

    y = y_;
    b = b_;

    const local_ordinal_type blocksize = blocksize_requested;
    const local_ordinal_type nrows = d_inv.extent(0);

    const local_ordinal_type team_size = 8;
    const local_ordinal_type vector_size = 8;
    const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type);
    Kokkos::TeamPolicy<execution_space> policy(
        (nrows + team_size - 1) / team_size, team_size, vector_size);
    policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size));
    Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy,
                         *this);
    IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
    IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
  }
};

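// Usage sketch (illustrative only; assumes the enclosing BlockTriDi solver has
// already assembled an AmD<MatrixType> 'amd', the inverse diagonal blocks
// 'd_inv', the norm workspace 'W', and the multivector views 'b', 'x',
// 'x_remote', and 'y'):
//
//   ComputeResidualAndSolve_2Pass<MatrixType, 0> functor(
//       amd, d_inv, W, blocksize, damping_factor);
//   functor.run_pass1(b, x, y);           // owned residual; x_remote not needed yet
//   // ... finish importing x_remote ...
//   functor.run_pass2(x, x_remote, y);    // remote residual, D^{-1}, damped update, norms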
}  // namespace Ifpack2::BlockHelperDetails

#endif