Ifpack2 Templated Preconditioning Package, Version 1.0
Ifpack2_BlockComputeResidualVector.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
11 #define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
12 
13 #include "Ifpack2_BlockHelper.hpp"
14 
15 namespace Ifpack2 {
16 
17 namespace BlockHelperDetails {
18 
22 template <typename MatrixType>
23 struct AmD {
24  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
25  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
26  using size_type_1d_view = typename impl_type::size_type_1d_view;
27  using i64_3d_view = typename impl_type::i64_3d_view;
28  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
29  // rowptr points to the start of each row of A_colindsub.
30  size_type_1d_view rowptr, rowptr_remote;
31  // Indices into A's rows giving the blocks to extract. rowptr(i) points to
32  // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
33  // where g is A's graph, are the columns AmD uses. If seq_method_, then
34  // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
35  // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
36  // contains the remote ones.
37  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
38  // Precomputed direct offsets to A,x blocks, for owned entries (OverlapTag case) or all entries (AsyncTag case)
39  i64_3d_view A_x_offsets;
40  // Precomputed direct offsets to A,x blocks, for non-owned entries (OverlapTag case). For AsyncTag case this is left empty.
41  i64_3d_view A_x_offsets_remote;
42 
43  // Currently always true.
44  bool is_tpetra_block_crs;
45 
46  // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
47  impl_scalar_type_1d_view_tpetra tpetra_values;
48 
49  AmD() = default;
50  AmD(const AmD &b) = default;
51 };
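// Illustrative sketch (not part of Ifpack2; it mirrors the SeqTag operator() further below):
// for a local block row lr, the owned blocks that AmD references are visited as
//
//   const size_type A_k0 = g.row_map(lr);              // start of row lr in A's block graph g
//   for (size_type k = rowptr(lr); k < rowptr(lr + 1); ++k) {
//     const size_type j = A_k0 + A_colindsub(k);       // entry index in A's block graph
//     const local_ordinal_type c = g.entries(j);       // block column (LID)
//     const impl_scalar_type *AA = &tpetra_values(j * blocksize * blocksize);
//     // ... y(lr * blocksize + :, col) -= AA * x(c * blocksize + :, col)
//   }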
52 
53 template <typename MatrixType>
54 struct PartInterface {
55  using local_ordinal_type = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
56  using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
57  using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;
58 
59  PartInterface() = default;
60  PartInterface(const PartInterface &b) = default;
61 
62  // Some terms:
63  // The matrix A is split as A = D + R, where D is the matrix of tridiag
64  // blocks and R is the remainder.
65  // A part is roughly a synonym for a tridiag. The distinction is that a part
66  // is the set of rows belonging to one tridiag and, equivalently, the off-diag
67  // rows in R associated with that tridiag. In contrast, the term tridiag is
68  // used to refer specifically to tridiag data, such as the pointer into the
69  // tridiag data array.
70  // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
71  // tridiag, and partptr points to the beginning of each tridiag. This is the
72  // LID space.
73  // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
74  // by this ordinal. This is the 'index' space.
75  // A flat index is the mathematical index into an array. A pack index
76  // accounts for SIMD packing.
77 
78  // Local row LIDs. Permutation from caller's index space to tridiag index
79  // space.
80  local_ordinal_type_1d_view lclrow;
81  // partptr_ is the pointer array into lclrow_.
82  local_ordinal_type_1d_view partptr; // np+1
83  local_ordinal_type_2d_view partptr_sub;
84  local_ordinal_type_1d_view partptr_schur;
85  // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
86  // is the start of the i'th pack.
87  local_ordinal_type_1d_view packptr; // npack+1
88  local_ordinal_type_1d_view packptr_sub;
89  local_ordinal_type_1d_view packindices_sub;
90  local_ordinal_type_2d_view packindices_schur;
91  // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
92  // an alias of partptr_ in the case of no overlap.
93  local_ordinal_type_1d_view part2rowidx0; // np+1
94  local_ordinal_type_1d_view part2rowidx0_sub;
95  // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
96  // it's the same as part2rowidx0_; if it's > 1, then the value is combined
97  // with i % vector_length to get the location in the packed data.
98  local_ordinal_type_1d_view part2packrowidx0; // np+1
99  local_ordinal_type_2d_view part2packrowidx0_sub;
100  local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
101  // rowidx2part_ maps the row index to the part index.
102  local_ordinal_type_1d_view rowidx2part; // nr
103  local_ordinal_type_1d_view rowidx2part_sub;
104  // True if lcl{row|col} is at most a constant away from row{idx|col}. In
105  // practice, this knowledge is not particularly useful, as packing for batched
106  // processing is done at the same time as the permutation from LID to index
107  // space. But it's easy to detect, so it's recorded in case an optimization
108  // can be made based on it.
109  bool row_contiguous;
110 
111  local_ordinal_type max_partsz;
112  local_ordinal_type max_subpartsz;
113  local_ordinal_type n_subparts_per_part;
114  local_ordinal_type nparts;
115 };
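// Illustrative sketch (this is exactly what the operator() implementations below do):
// to locate block row rowidx in the SIMD-packed data,
//
//   const local_ordinal_type partidx = rowidx2part(rowidx);
//   const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
//   const local_ordinal_type v   = partidx % vector_length;
//
// and entry k of vector col then lives at y_packed(pri, k, col)[v].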
116 
117 // Precompute offsets of each A and x entry to speed up the residual computation.
118 // (Applies when hasBlockCrsMatrix == true and for the OverlapTag/AsyncTag cases.)
119 // Reading A and x takes up to 4 and 6 levels of indirection, respectively,
120 // but precomputing the offsets reduces both to 2.
121 //
122 // This function allocates and populates these members of AmD:
123 // A_x_offsets, A_x_offsets_remote
124 template <typename MatrixType>
125 void precompute_A_x_offsets(
126  AmD<MatrixType> &amd,
127  const PartInterface<MatrixType> &interf,
128  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_crs_graph_type> &g,
129  const typename ImplType<MatrixType>::local_ordinal_type_1d_view &dm2cm,
130  int blocksize,
131  bool ownedRemoteSeparate) {
132  using impl_type = ImplType<MatrixType>;
133  using i64_3d_view = typename impl_type::i64_3d_view;
134  using size_type = typename impl_type::size_type;
135  using local_ordinal_type = typename impl_type::local_ordinal_type;
136  using execution_space = typename impl_type::execution_space;
137  auto local_graph = g->getLocalGraphDevice();
138  const auto A_block_rowptr = local_graph.row_map;
139  const auto A_colind = local_graph.entries;
140  local_ordinal_type numLocalRows = interf.rowidx2part.extent(0);
141  int blocksize_square = blocksize * blocksize;
142  // Shallow-copy the views here to avoid capturing the amd and interf objects in the lambdas below.
143  auto lclrow = interf.lclrow;
144  auto A_colindsub = amd.A_colindsub;
145  auto A_colindsub_remote = amd.A_colindsub_remote;
146  auto rowptr = amd.rowptr;
147  auto rowptr_remote = amd.rowptr_remote;
148  bool is_dm2cm_active = dm2cm.extent(0);
149  if (ownedRemoteSeparate) {
150  // amd.rowptr points to owned entries only, and amd.rowptr_remote points to nonowned.
151  local_ordinal_type maxOwnedEntriesPerRow = 0;
152  local_ordinal_type maxNonownedEntriesPerRow = 0;
153  Kokkos::parallel_reduce(
154  Kokkos::RangePolicy<execution_space>(0, numLocalRows),
155  KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmaxOwned, local_ordinal_type & lmaxNonowned) {
156  const local_ordinal_type lr = lclrow(i);
157  local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowptr(lr);
158  if (rowNumOwned > lmaxOwned)
159  lmaxOwned = rowNumOwned;
160  // rowptr_remote won't be allocated for single-rank problems
161  if (rowptr_remote.extent(0)) {
162  local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowptr_remote(lr);
163  if (rowNumNonowned > lmaxNonowned)
164  lmaxNonowned = rowNumNonowned;
165  } else {
166  lmaxNonowned = 0;
167  }
168  },
169  Kokkos::Max<local_ordinal_type>(maxOwnedEntriesPerRow), Kokkos::Max<local_ordinal_type>(maxNonownedEntriesPerRow));
170  // Allocate the two offsets views now that we know the dimensions
171  // For each one, the middle dimension is 0 for A offsets and 1 for x offsets.
172  // Packing them together in one view improves cache line utilization
173  amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxOwnedEntriesPerRow);
174  amd.A_x_offsets_remote = i64_3d_view("amd.A_x_offsets_remote", numLocalRows, 2, maxNonownedEntriesPerRow);
175  auto A_x_offsets = amd.A_x_offsets;
176  auto A_x_offsets_remote = amd.A_x_offsets_remote;
177  // Now, populate all the offsets. Use ArithTraits<int64_t>::min to mark absent entries.
178  Kokkos::parallel_for(
179  Kokkos::RangePolicy<execution_space>(0, numLocalRows),
180  KOKKOS_LAMBDA(local_ordinal_type i) {
181  const local_ordinal_type lr = lclrow(i);
182  const size_type A_k0 = A_block_rowptr(lr);
183  // Owned entries
184  size_type rowBegin = rowptr(lr);
185  local_ordinal_type rowNumOwned = rowptr(lr + 1) - rowBegin;
186  for (local_ordinal_type entry = 0; entry < maxOwnedEntriesPerRow; entry++) {
187  if (entry < rowNumOwned) {
188  const size_type j = A_k0 + A_colindsub(rowBegin + entry);
189  const local_ordinal_type A_colind_at_j = A_colind(j);
190  const local_ordinal_type loc = is_dm2cm_active ? dm2cm(A_colind_at_j) : A_colind_at_j;
191  A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
192  A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
193  } else {
194 #if KOKKOS_VERSION >= 40799
195  A_x_offsets(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
196 #else
197  A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
198 #endif
199 #if KOKKOS_VERSION >= 40799
200  A_x_offsets(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
201 #else
202  A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
203 #endif
204  }
205  }
206  // Nonowned entries
207  if (rowptr_remote.extent(0)) {
208  rowBegin = rowptr_remote(lr);
209  local_ordinal_type rowNumNonowned = rowptr_remote(lr + 1) - rowBegin;
210  for (local_ordinal_type entry = 0; entry < maxNonownedEntriesPerRow; entry++) {
211  if (entry < rowNumNonowned) {
212  const size_type j = A_k0 + A_colindsub_remote(rowBegin + entry);
213  const local_ordinal_type A_colind_at_j = A_colind(j);
214  const local_ordinal_type loc = A_colind_at_j - numLocalRows;
215  A_x_offsets_remote(i, 0, entry) = int64_t(j) * blocksize_square;
216  A_x_offsets_remote(i, 1, entry) = int64_t(loc) * blocksize;
217  } else {
218 #if KOKKOS_VERSION >= 40799
219  A_x_offsets_remote(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
220 #else
221  A_x_offsets_remote(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
222 #endif
223 #if KOKKOS_VERSION >= 40799
224  A_x_offsets_remote(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
225 #else
226  A_x_offsets_remote(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
227 #endif
228  }
229  }
230  }
231  });
232  } else {
233  // amd.rowptr points to both owned and nonowned entries, so it tells us how many columns (last dim) A_x_offsets should have.
234  local_ordinal_type maxEntriesPerRow = 0;
235  Kokkos::parallel_reduce(
236  Kokkos::RangePolicy<execution_space>(0, numLocalRows),
237  KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type & lmax) {
238  const local_ordinal_type lr = lclrow(i);
239  local_ordinal_type rowNum = rowptr(lr + 1) - rowptr(lr);
240  if (rowNum > lmax)
241  lmax = rowNum;
242  },
243  Kokkos::Max<local_ordinal_type>(maxEntriesPerRow));
244  amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxEntriesPerRow);
245  auto A_x_offsets = amd.A_x_offsets;
246  // Populate A,x offsets. Use ArithTraits<int64_t>::min to mark absent entries.
247  // For nonowned columns the stored x offset is >= blocksize*numLocalRows; the consumer subtracts that cutoff and indexes into x_remote instead of x.
248  Kokkos::parallel_for(
249  Kokkos::RangePolicy<execution_space>(0, numLocalRows),
250  KOKKOS_LAMBDA(local_ordinal_type i) {
251  const local_ordinal_type lr = lclrow(i);
252  const size_type A_k0 = A_block_rowptr(lr);
253  // Owned entries
254  size_type rowBegin = rowptr(lr);
255  local_ordinal_type rowOwned = rowptr(lr + 1) - rowBegin;
256  for (local_ordinal_type entry = 0; entry < maxEntriesPerRow; entry++) {
257  if (entry < rowOwned) {
258  const size_type j = A_k0 + A_colindsub(rowBegin + entry);
259  A_x_offsets(i, 0, entry) = j * blocksize_square;
260  const local_ordinal_type A_colind_at_j = A_colind(j);
261  if (A_colind_at_j < numLocalRows) {
262  const local_ordinal_type loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
263  A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
264  } else {
265  A_x_offsets(i, 1, entry) = int64_t(A_colind_at_j) * blocksize;
266  }
267  } else {
268 #if KOKKOS_VERSION >= 40799
269  A_x_offsets(i, 0, entry) = KokkosKernels::ArithTraits<int64_t>::min();
270 #else
271  A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
272 #endif
273 #if KOKKOS_VERSION >= 40799
274  A_x_offsets(i, 1, entry) = KokkosKernels::ArithTraits<int64_t>::min();
275 #else
276  A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
277 #endif
278  }
279  }
280  });
281  }
282 }
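// Illustrative sketch of how the offsets built above are consumed (see the BlockCrs GPU
// operator() below); only two indirections remain per entry:
//
//   const int64_t A_offset = A_x_offsets(rowidx, 0, k);     // element offset into tpetra_values
//   const int64_t x_offset = A_x_offsets(rowidx, 1, k);     // row offset into the x multivector (or x_remote)
//   if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {  // min() marks an absent entry
//     A_block_cst.assign_data(tpetra_values.data() + A_offset);
//     xx.assign_data(&x(x_offset, col));
//   }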
283 
287 static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
288  const int team_size) {
289  int total_team_size(0);
290  if (blksize <= 5)
291  total_team_size = 32;
292  else if (blksize <= 9)
293  total_team_size = 32; // 64
294  else if (blksize <= 12)
295  total_team_size = 96;
296  else if (blksize <= 16)
297  total_team_size = 128;
298  else if (blksize <= 20)
299  total_team_size = 160;
300  else
301  total_team_size = 160;
302  return total_team_size / team_size;
303 }
304 
305 static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
306  const int team_size) {
307  int total_team_size(0);
308  if (blksize <= 5)
309  total_team_size = 32;
310  else if (blksize <= 9)
311  total_team_size = 32; // 64
312  else if (blksize <= 12)
313  total_team_size = 96;
314  else if (blksize <= 16)
315  total_team_size = 128;
316  else if (blksize <= 20)
317  total_team_size = 160;
318  else
319  total_team_size = 160;
320  return total_team_size / team_size;
321 }
322 
323 static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
324  const int team_size) {
325  int total_team_size(0);
326  if (blksize <= 5)
327  total_team_size = 32;
328  else if (blksize <= 9)
329  total_team_size = 32; // 64
330  else if (blksize <= 12)
331  total_team_size = 96;
332  else if (blksize <= 16)
333  total_team_size = 128;
334  else if (blksize <= 20)
335  total_team_size = 160;
336  else
337  total_team_size = 160;
338  return total_team_size / team_size;
339 }
340 
341 template <typename T>
342 static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
343  const int team_size) {
344  if (is_cuda<T>::value)
345  return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
346  if (is_hip<T>::value)
347  return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
348  if (is_sycl<T>::value)
349  return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
350  return -1;
351 }
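// Example: on a CUDA/HIP/SYCL backend with blocksize 7 and team_size 8, the table above
// gives total_team_size 32, so the recommended vector length is 32 / 8 = 4. The run()
// methods below use the result as the third TeamPolicy argument:
//
//   const local_ordinal_type vector_size =
//       ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
//   Kokkos::TeamPolicy<execution_space, SeqTag> policy(league_size, team_size, vector_size);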
352 
353 template <typename MatrixType>
354 struct ComputeResidualVector {
355  public:
356  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
357  using node_device_type = typename impl_type::node_device_type;
358  using execution_space = typename impl_type::execution_space;
359  using memory_space = typename impl_type::memory_space;
360 
361  using local_ordinal_type = typename impl_type::local_ordinal_type;
362  using size_type = typename impl_type::size_type;
363  using impl_scalar_type = typename impl_type::impl_scalar_type;
364  using magnitude_type = typename impl_type::magnitude_type;
365  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
366  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
368  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
369  using size_type_1d_view = typename impl_type::size_type_1d_view;
370  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
371  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
372  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
373  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
374  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
375  using i64_3d_view = typename impl_type::i64_3d_view;
376  static constexpr int vector_length = impl_type::vector_length;
377 
379  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
380 
381  // enum for max blocksize (used to size on-stack buffers when B == 0)
382  enum : int { max_blocksize = 32 };
383 
384  private:
385  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
386  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
387  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
388  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
389  Unmanaged<vector_type_3d_view> y_packed;
390  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
391 
392  // AmD information
393  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
394  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
395  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
396 
397  // block crs graph information
398  // for cuda (kokkos crs graph uses a different size_type from size_t)
399  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_block_rowptr;
400  const ConstUnmanaged<Kokkos::View<size_t *, node_device_type>> A_point_rowptr;
401  const ConstUnmanaged<Kokkos::View<local_ordinal_type *, node_device_type>> A_colind;
402 
403  // blocksize
404  const local_ordinal_type blocksize_requested;
405 
406  // part interface
407  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
408  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
409  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
410  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
411  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
412  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
413 
414  // block offsets
415  const ConstUnmanaged<i64_3d_view> A_x_offsets;
416  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
417 
418  const bool is_dm2cm_active;
419  const bool hasBlockCrsMatrix;
420 
421  public:
422  template <typename LocalCrsGraphType>
423  ComputeResidualVector(const AmD<MatrixType> &amd,
424  const LocalCrsGraphType &block_graph,
425  const LocalCrsGraphType &point_graph,
426  const local_ordinal_type &blocksize_requested_,
427  const PartInterface<MatrixType> &interf,
428  const local_ordinal_type_1d_view &dm2cm_,
429  bool hasBlockCrsMatrix_)
430  : rowptr(amd.rowptr)
431  , rowptr_remote(amd.rowptr_remote)
432  , colindsub(amd.A_colindsub)
433  , colindsub_remote(amd.A_colindsub_remote)
434  , tpetra_values(amd.tpetra_values)
435  , A_block_rowptr(block_graph.row_map)
436  , A_point_rowptr(point_graph.row_map)
437  , A_colind(block_graph.entries)
438  , blocksize_requested(blocksize_requested_)
439  , part2packrowidx0(interf.part2packrowidx0)
440  , part2rowidx0(interf.part2rowidx0)
441  , rowidx2part(interf.rowidx2part)
442  , partptr(interf.partptr)
443  , lclrow(interf.lclrow)
444  , dm2cm(dm2cm_)
445  , A_x_offsets(amd.A_x_offsets)
446  , A_x_offsets_remote(amd.A_x_offsets_remote)
447  , is_dm2cm_active(dm2cm_.span() > 0)
448  , hasBlockCrsMatrix(hasBlockCrsMatrix_) {}
449 
450  inline void
451  SerialDot(const local_ordinal_type &blocksize,
452  const local_ordinal_type &lclRowID,
453  const local_ordinal_type &lclColID,
454  const local_ordinal_type &ii,
455  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
456  const impl_scalar_type *const KOKKOS_RESTRICT xx,
457  /* */ impl_scalar_type *KOKKOS_RESTRICT yy) const {
458  const size_type Aj_c = colindsub_(lclColID);
459  auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
460  impl_scalar_type val = 0;
461 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
462 #pragma ivdep
463 #endif
464 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
465 #pragma unroll
466 #endif
467  for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
468  val += tpetra_values(point_row_offset + k1) * xx[k1];
469  yy[ii] -= val;
470  }
471 
472  inline void
473  SerialGemv(const local_ordinal_type &blocksize,
474  const impl_scalar_type *const KOKKOS_RESTRICT AA,
475  const impl_scalar_type *const KOKKOS_RESTRICT xx,
476  /* */ impl_scalar_type *KOKKOS_RESTRICT yy) const {
477  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
478  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
479  impl_scalar_type val = 0;
480 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
481 #pragma ivdep
482 #endif
483 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
484 #pragma unroll
485 #endif
486  for (local_ordinal_type k1 = 0; k1 < blocksize; ++k1)
487  val += AA[tlb::getFlatIndex(k0, k1, blocksize)] * xx[k1];
488  yy[k0] -= val;
489  }
490  }
491 
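// Note: with the LayoutRight little blocks of Tpetra's block CRS storage (see the
// tpetra_block_access_view_type typedef above and the comment on VectorGemv below),
// tlb::getFlatIndex(k0, k1, blocksize) is expected to reduce to the row-major index
// k0 * blocksize + k1, so SerialGemv computes yy(0:blocksize) -= AA * xx(0:blocksize)
// for one dense blocksize x blocksize block.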
492  template <typename bbViewType, typename yyViewType>
493  KOKKOS_INLINE_FUNCTION void
494  VectorCopy(const member_type &member,
495  const local_ordinal_type &blocksize,
496  const bbViewType &bb,
497  const yyViewType &yy) const {
498  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
499  yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
500  });
501  }
502 
503  template <typename xxViewType, typename yyViewType>
504  KOKKOS_INLINE_FUNCTION void
505  VectorDot(const member_type &member,
506  const local_ordinal_type &blocksize,
507  const local_ordinal_type &lclRowID,
508  const local_ordinal_type &lclColID,
509  const local_ordinal_type &ii,
510  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
511  const xxViewType &xx,
512  const yyViewType &yy) const {
513  const size_type Aj_c = colindsub_(lclColID);
514  auto point_row_offset = A_point_rowptr(lclRowID * blocksize + ii) + Aj_c * blocksize;
515  impl_scalar_type val = 0;
516  Kokkos::parallel_reduce(
517  Kokkos::ThreadVectorRange(member, blocksize),
518  [&](const local_ordinal_type &k1, impl_scalar_type &update) {
519  update += tpetra_values(point_row_offset + k1) * xx(k1);
520  },
521  val);
522  Kokkos::single(Kokkos::PerThread(member),
523  [=]() {
524  Kokkos::atomic_add(&yy(ii), typename yyViewType::const_value_type(-val));
525  });
526  }
527 
528  // BMK: This version coalesces accesses to AA for LayoutRight blocks.
529  template <typename AAViewType, typename xxViewType, typename yyViewType>
530  KOKKOS_INLINE_FUNCTION void
531  VectorGemv(const member_type &member,
532  const local_ordinal_type &blocksize,
533  const AAViewType &AA,
534  const xxViewType &xx,
535  const yyViewType &yy) const {
536  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0) {
537  impl_scalar_type val = 0;
538  Kokkos::parallel_reduce(
539  Kokkos::ThreadVectorRange(member, blocksize),
540  [&](const local_ordinal_type &k1, impl_scalar_type &update) {
541  update += AA(k0, k1) * xx(k1);
542  },
543  val);
544  Kokkos::single(Kokkos::PerThread(member),
545  [=]() {
546  Kokkos::atomic_add(&yy(k0), -val);
547  });
548  }
549  }
550 
551  struct SeqTag {};
552 
553  KOKKOS_INLINE_FUNCTION
554  void
555  operator()(const SeqTag &, const local_ordinal_type &i) const {
556  const local_ordinal_type blocksize = blocksize_requested;
557  const local_ordinal_type blocksize_square = blocksize * blocksize;
558 
559  // constants
560  const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
561  const local_ordinal_type num_vectors = y.extent(1);
562  const local_ordinal_type row = i * blocksize;
563  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
564  // y := b
565  impl_scalar_type *yy = &y(row, col);
566  const impl_scalar_type *const bb = &b(row, col);
567  memcpy(yy, bb, sizeof(impl_scalar_type) * blocksize);
568 
569  // y -= Rx
570  const size_type A_k0 = A_block_rowptr[i];
571  for (size_type k = rowptr[i]; k < rowptr[i + 1]; ++k) {
572  const size_type j = A_k0 + colindsub[k];
573  const impl_scalar_type *const xx = &x(A_colind[j] * blocksize, col);
574  if (hasBlockCrsMatrix) {
575  const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
576  SerialGemv(blocksize, AA, xx, yy);
577  } else {
578  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
579  SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
580  }
581  }
582  }
583  }
584 
585  KOKKOS_INLINE_FUNCTION
586  void
587  operator()(const SeqTag &, const member_type &member) const {
588  // constants
589  const local_ordinal_type blocksize = blocksize_requested;
590  const local_ordinal_type blocksize_square = blocksize * blocksize;
591 
592  const local_ordinal_type lr = member.league_rank();
593  const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
594  const local_ordinal_type num_vectors = y.extent(1);
595 
596  // subview pattern
597  auto bb = Kokkos::subview(b, block_range, 0);
598  auto xx = bb;
599  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
600  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);
601 
602  const local_ordinal_type row = lr * blocksize;
603  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
604  // y := b
605  yy.assign_data(&y(row, col));
606  bb.assign_data(&b(row, col));
607  if (member.team_rank() == 0)
608  VectorCopy(member, blocksize, bb, yy);
609  member.team_barrier();
610 
611  // y -= Rx
612  const size_type A_k0 = A_block_rowptr[lr];
613 
614  if (hasBlockCrsMatrix) {
615  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
616  [&](const local_ordinal_type &k) {
617  const size_type j = A_k0 + colindsub[k];
618  xx.assign_data(&x(A_colind[j] * blocksize, col));
619  A_block_cst.assign_data(&tpetra_values(j * blocksize_square));
620  VectorGemv(member, blocksize, A_block_cst, xx, yy);
621  });
622  } else {
623  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr + 1]),
624  [&](const local_ordinal_type &k) {
625  const size_type j = A_k0 + colindsub[k];
626  xx.assign_data(&x(A_colind[j] * blocksize, col));
627 
628  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
629  VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
630  });
631  }
632  }
633  }
634 
635  // * B: block size for compile-time specialization, or 0 for general case (up to max_blocksize)
636  // * async: true if using async importer. overlap is not used in this case.
637  // Whether a column is owned or nonowned is decided at runtime.
638  // * overlap: true if processing the columns that are not locally owned,
639  // false if processing locally owned columns.
640  // * haveBlockMatrix: true if A is a BlockCrsMatrix, false if it's CrsMatrix.
641  template <int B, bool async, bool overlap, bool haveBlockMatrix>
642  struct GeneralTag {
643  static_assert(!(async && overlap),
644  "ComputeResidualVector: async && overlap is not a valid configuration for GeneralTag");
645  };
646 
647  // Define AsyncTag and OverlapTag in terms of GeneralTag:
648  // P == 0 means only compute on owned columns
649  // P == 1 means only compute on nonowned columns
650  template <int P, int B, bool haveBlockMatrix>
651  using OverlapTag = GeneralTag<B, false, P != 0, haveBlockMatrix>;
652 
653  template <int B, bool haveBlockMatrix>
654  using AsyncTag = GeneralTag<B, true, false, haveBlockMatrix>;
655 
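// For example, AsyncTag<3, true> is GeneralTag<3, true, false, true> (async importer,
// compile-time blocksize 3, BlockCrsMatrix), while OverlapTag<1, 0, false> is
// GeneralTag<0, false, true, false> (nonowned columns only, runtime blocksize, point CrsMatrix).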
656  // CPU implementation for all cases
657  template <int B, bool async, bool overlap, bool haveBlockMatrix>
658  KOKKOS_INLINE_FUNCTION void
659  operator()(const GeneralTag<B, async, overlap, haveBlockMatrix> &, const local_ordinal_type &rowidx) const {
660  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
661 
662  // constants
663  const local_ordinal_type partidx = rowidx2part(rowidx);
664  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
665  const local_ordinal_type v = partidx % vector_length;
666 
667  const local_ordinal_type num_vectors = y_packed.extent(2);
668  const local_ordinal_type num_local_rows = lclrow.extent(0);
669 
670  // temporary buffer for y flat
671  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
672 
673  const local_ordinal_type lr = lclrow(rowidx);
674 
675  auto colindsub_used = overlap ? colindsub_remote : colindsub;
676  auto rowptr_used = overlap ? rowptr_remote : rowptr;
677 
678  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
679  if constexpr (overlap) {
680  // y (temporary) := 0
681  memset((void *)yy, 0, sizeof(impl_scalar_type) * blocksize);
682  } else {
683  // y := b
684  const local_ordinal_type row = lr * blocksize;
685  memcpy(yy, &b(row, col), sizeof(impl_scalar_type) * blocksize);
686  }
687 
688  // y -= Rx
689  const size_type A_k0 = A_block_rowptr[lr];
690  for (size_type k = rowptr_used[lr]; k < rowptr_used[lr + 1]; ++k) {
691  const size_type j = A_k0 + colindsub_used[k];
692  const local_ordinal_type A_colind_at_j = A_colind[j];
693  if constexpr (haveBlockMatrix) {
694  const local_ordinal_type blocksize_square = blocksize * blocksize;
695  const impl_scalar_type *const AA = &tpetra_values(j * blocksize_square);
696  if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
697  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
698  const impl_scalar_type *const xx = &x(loc * blocksize, col);
699  SerialGemv(blocksize, AA, xx, yy);
700  } else {
701  const auto loc = A_colind_at_j - num_local_rows;
702  const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
703  SerialGemv(blocksize, AA, xx_remote, yy);
704  }
705  } else {
706  if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
707  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
708  const impl_scalar_type *const xx = &x(loc * blocksize, col);
709  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
710  SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
711  } else {
712  const auto loc = A_colind_at_j - num_local_rows;
713  const impl_scalar_type *const xx_remote = &x_remote(loc * blocksize, col);
714  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
715  SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
716  }
717  }
718  }
719  // move yy to y_packed
720  if constexpr (overlap) {
721  for (local_ordinal_type k = 0; k < blocksize; ++k)
722  y_packed(pri, k, col)[v] += yy[k];
723  } else {
724  for (local_ordinal_type k = 0; k < blocksize; ++k)
725  y_packed(pri, k, col)[v] = yy[k];
726  }
727  }
728  }
729 
730  // GPU implementation for hasBlockCrsMatrix == true
731  template <int B, bool async, bool overlap>
732  KOKKOS_INLINE_FUNCTION void
733  operator()(const GeneralTag<B, async, overlap, true> &, const member_type &member) const {
734  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
735 
736  // constants
737  const local_ordinal_type rowidx = member.league_rank();
738  const local_ordinal_type partidx = rowidx2part(rowidx);
739  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
740  const local_ordinal_type v = partidx % vector_length;
741 
742  const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
743  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
744  const local_ordinal_type num_local_rows = lclrow.extent(0);
745 
746  // subview pattern
747  auto bb = Kokkos::subview(b, block_range, 0);
748  auto xx = bb;
749  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
750  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);
751 
752  // Get scratch allocations: a per-team accumulator for Ax and a per-thread local copy of x
753  impl_scalar_type *local_Ax = reinterpret_cast<impl_scalar_type *>(member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
754  impl_scalar_type *local_x = reinterpret_cast<impl_scalar_type *>(member.thread_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type)));
755 
756  const local_ordinal_type lr = lclrow(rowidx);
757  const local_ordinal_type row = lr * blocksize;
758  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
759  if (col)
760  member.team_barrier();
761  // y -= Rx
762  // Initialize accumulation array
763  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
764  local_Ax[i] = 0;
765  });
766  member.team_barrier();
767 
768  int numEntries;
769  if constexpr (!overlap) {
770  numEntries = A_x_offsets.extent(2);
771  } else {
772  numEntries = A_x_offsets_remote.extent(2);
773  }
774 
775  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numEntries),
776  [&](const int k) {
777  int64_t A_offset = overlap ? A_x_offsets_remote(rowidx, 0, k) : A_x_offsets(rowidx, 0, k);
778  int64_t x_offset = overlap ? A_x_offsets_remote(rowidx, 1, k) : A_x_offsets(rowidx, 1, k);
779 #if KOKKOS_VERSION >= 40799
780  if (A_offset != KokkosKernels::ArithTraits<int64_t>::min()) {
781 #else
782  if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
783 #endif
784  A_block_cst.assign_data(tpetra_values.data() + A_offset);
785  // Pull x into local memory
786  if constexpr (async) {
787  size_type remote_cutoff = blocksize * num_local_rows;
788  if (x_offset >= remote_cutoff)
789  xx.assign_data(&x_remote(x_offset - remote_cutoff, col));
790  else
791  xx.assign_data(&x(x_offset, col));
792  } else {
793  if constexpr (!overlap) {
794  xx.assign_data(&x(x_offset, col));
795  } else {
796  xx.assign_data(&x_remote(x_offset, col));
797  }
798  }
799 
800  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
801  local_x[i] = xx(i);
802  });
803 
804  // MatVec op Ax += A*x
805  Kokkos::parallel_for(
806  Kokkos::ThreadVectorRange(member, blocksize),
807  [&](const local_ordinal_type &k0) {
808  impl_scalar_type val = 0;
809  for (int k1 = 0; k1 < blocksize; k1++)
810  val += A_block_cst(k0, k1) * local_x[k1];
811  Kokkos::atomic_add(local_Ax + k0, val);
812  });
813  }
814  });
815  member.team_barrier();
816  // Update y = b - local_Ax
817  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
818  bb.assign_data(&b(row, col));
819  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type &i) {
820  if (!overlap)
821  yy(i) = bb(i) - local_Ax[i];
822  else
823  yy(i) -= local_Ax[i];
824  });
825  }
826  }
827 
828  // GPU implementation for hasBlockCrsMatrix == false
829  template <int B, bool async, bool overlap>
830  KOKKOS_INLINE_FUNCTION void
831  operator()(const GeneralTag<B, async, overlap, false> &, const member_type &member) const {
832  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
833 
834  // constants
835  const local_ordinal_type rowidx = member.league_rank();
836  const local_ordinal_type partidx = rowidx2part(rowidx);
837  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
838  const local_ordinal_type v = partidx % vector_length;
839 
840  const Kokkos::pair<local_ordinal_type, local_ordinal_type> block_range(0, blocksize);
841  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
842  const local_ordinal_type num_local_rows = lclrow.extent(0);
843 
844  // subview pattern
845  auto bb = Kokkos::subview(b, block_range, 0);
846  auto xx = bb;
847  auto xx_remote = bb;
848  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
849  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(tpetra_values.data(), blocksize, blocksize);
850  auto colindsub_used = overlap ? colindsub_remote : colindsub;
851  auto rowptr_used = overlap ? rowptr_remote : rowptr;
852 
853  const local_ordinal_type lr = lclrow(rowidx);
854  const local_ordinal_type row = lr * blocksize;
855  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
856  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
857  if (!overlap) {
858  // y := b
859  bb.assign_data(&b(row, col));
860  if (member.team_rank() == 0)
861  VectorCopy(member, blocksize, bb, yy);
862  member.team_barrier();
863  }
864 
865  // y -= Rx
866  const size_type A_k0 = A_block_rowptr[lr];
867  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr + 1]),
868  [&](const local_ordinal_type &k) {
869  const size_type j = A_k0 + colindsub_used[k];
870  const local_ordinal_type A_colind_at_j = A_colind[j];
871  if ((async && A_colind_at_j < num_local_rows) || (!async && !overlap)) {
872  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
873  xx.assign_data(&x(loc * blocksize, col));
874  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
875  VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
876  } else {
877  const auto loc = A_colind_at_j - num_local_rows;
878  xx_remote.assign_data(&x_remote(loc * blocksize, col));
879  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
880  VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
881  }
882  });
883  }
884  }
885 
886  // y = b - Rx; seq method
887  template <typename MultiVectorLocalViewTypeY,
888  typename MultiVectorLocalViewTypeB,
889  typename MultiVectorLocalViewTypeX>
890  void run(const MultiVectorLocalViewTypeY &y_,
891  const MultiVectorLocalViewTypeB &b_,
892  const MultiVectorLocalViewTypeX &x_) {
893  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
894  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);
895 
896  y = y_;
897  b = b_;
898  x = x_;
899  if constexpr (is_device<execution_space>::value) {
900  const local_ordinal_type blocksize = blocksize_requested;
901  const local_ordinal_type team_size = 8;
902  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
903  const Kokkos::TeamPolicy<execution_space, SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
904  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
905  } else {
906  const Kokkos::RangePolicy<execution_space, SeqTag> policy(0, rowptr.extent(0) - 1);
907  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
908  }
909  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
910  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
911  }
912 
913  // y = b - R (x , x_remote)
914  template <typename MultiVectorLocalViewTypeB,
915  typename MultiVectorLocalViewTypeX,
916  typename MultiVectorLocalViewTypeX_Remote>
917  void run(const vector_type_3d_view &y_packed_,
918  const MultiVectorLocalViewTypeB &b_,
919  const MultiVectorLocalViewTypeX &x_,
920  const MultiVectorLocalViewTypeX_Remote &x_remote_) {
921  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
922  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);
923 
924  b = b_;
925  x = x_;
926  x_remote = x_remote_;
927  if constexpr (is_device<execution_space>::value) {
928  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
929  y_packed_.extent(0),
930  y_packed_.extent(1),
931  y_packed_.extent(2),
932  vector_length);
933  } else {
934  y_packed = y_packed_;
935  }
936 
937  if constexpr (is_device<execution_space>::value) {
938  const local_ordinal_type blocksize = blocksize_requested;
939  // local_ordinal_type vl_power_of_two = 1;
940  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
941  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
942  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
943 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
944  { \
945  if (this->hasBlockCrsMatrix) { \
946  const local_ordinal_type team_size = 8; \
947  const local_ordinal_type vector_size = 8; \
948  const size_t shmem_team_size = blocksize * sizeof(btdm_scalar_type); \
949  const size_t shmem_thread_size = blocksize * sizeof(btdm_scalar_type); \
950  Kokkos::TeamPolicy<execution_space, AsyncTag<B, true>> \
951  policy(rowidx2part.extent(0), team_size, vector_size); \
952  policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
953  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>", \
954  policy, *this); \
955  } else { \
956  const local_ordinal_type team_size = 8; \
957  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
958  const Kokkos::TeamPolicy<execution_space, AsyncTag<B, false>> \
959  policy(rowidx2part.extent(0), team_size, vector_size); \
960  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<AsyncTag>", \
961  policy, *this); \
962  } \
963  } \
964  break
965  switch (blocksize_requested) {
966  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
967  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
968  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
969  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
970  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
971  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
972  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
973  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
974  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
975  default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
976  }
977 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
978  } else {
979 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
980  { \
981  if (this->hasBlockCrsMatrix) { \
982  const Kokkos::RangePolicy<execution_space, AsyncTag<B, true>> policy(0, rowidx2part.extent(0)); \
983  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>", \
984  policy, *this); \
985  } else { \
986  const Kokkos::RangePolicy<execution_space, AsyncTag<B, false>> policy(0, rowidx2part.extent(0)); \
987  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>", \
988  policy, *this); \
989  } \
990  } \
991  break
992 
993  switch (blocksize_requested) {
994  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
995  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
996  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
997  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
998  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
999  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
1000  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
1001  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
1002  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
1003  default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
1004  }
1005 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
1006  }
1007  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
1008  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
1009  }
1010 
1011  // y = b - R (x , x_remote); owned and nonowned columns are handled in separate passes (OverlapTag)
1012  template <typename MultiVectorLocalViewTypeB,
1013  typename MultiVectorLocalViewTypeX,
1014  typename MultiVectorLocalViewTypeX_Remote>
1015  void run(const vector_type_3d_view &y_packed_,
1016  const MultiVectorLocalViewTypeB &b_,
1017  const MultiVectorLocalViewTypeX &x_,
1018  const MultiVectorLocalViewTypeX_Remote &x_remote_,
1019  const bool compute_owned) {
1020  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
1021  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);
1022 
1023  b = b_;
1024  x = x_;
1025  x_remote = x_remote_;
1026  if constexpr (is_device<execution_space>::value) {
1027  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type *)y_packed_.data(),
1028  y_packed_.extent(0),
1029  y_packed_.extent(1),
1030  y_packed_.extent(2),
1031  vector_length);
1032  } else {
1033  y_packed = y_packed_;
1034  }
1035 
1036  if constexpr (is_device<execution_space>::value) {
1037  const local_ordinal_type blocksize = blocksize_requested;
1038  // local_ordinal_type vl_power_of_two = 1;
1039  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
1040  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
1041  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
1042 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
1043  if (this->hasBlockCrsMatrix) { \
1044  const local_ordinal_type team_size = 8; \
1045  const local_ordinal_type vector_size = 8; \
1046  const size_t shmem_team_size = blocksize * sizeof(btdm_scalar_type); \
1047  const size_t shmem_thread_size = blocksize * sizeof(btdm_scalar_type); \
1048  if (compute_owned) { \
1049  Kokkos::TeamPolicy<execution_space, OverlapTag<0, B, true>> \
1050  policy(rowidx2part.extent(0), team_size, vector_size); \
1051  policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
1052  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
1053  } else { \
1054  Kokkos::TeamPolicy<execution_space, OverlapTag<1, B, true>> \
1055  policy(rowidx2part.extent(0), team_size, vector_size); \
1056  policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), Kokkos::PerThread(shmem_thread_size)); \
1057  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
1058  } \
1059  } else { \
1060  const local_ordinal_type team_size = 8; \
1061  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
1062  if (compute_owned) { \
1063  const Kokkos::TeamPolicy<execution_space, OverlapTag<0, B, false>> \
1064  policy(rowidx2part.extent(0), team_size, vector_size); \
1065  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
1066  } else { \
1067  const Kokkos::TeamPolicy<execution_space, OverlapTag<1, B, false>> \
1068  policy(rowidx2part.extent(0), team_size, vector_size); \
1069  Kokkos::parallel_for("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
1070  } \
1071  } \
1072  break
1073  switch (blocksize_requested) {
1074  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
1075  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
1076  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
1077  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
1078  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
1079  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
1080  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
1081  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
1082  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
1083  default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
1084  }
1085 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
1086  } else {
1087 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
1088  if (this->hasBlockCrsMatrix) { \
1089  if (compute_owned) { \
1090  const Kokkos::RangePolicy<execution_space, OverlapTag<0, B, true>> \
1091  policy(0, rowidx2part.extent(0)); \
1092  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
1093  } else { \
1094  const Kokkos::RangePolicy<execution_space, OverlapTag<1, B, true>> \
1095  policy(0, rowidx2part.extent(0)); \
1096  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
1097  } \
1098  } else { \
1099  if (compute_owned) { \
1100  const Kokkos::RangePolicy<execution_space, OverlapTag<0, B, false>> \
1101  policy(0, rowidx2part.extent(0)); \
1102  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
1103  } else { \
1104  const Kokkos::RangePolicy<execution_space, OverlapTag<1, B, false>> \
1105  policy(0, rowidx2part.extent(0)); \
1106  Kokkos::parallel_for("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
1107  } \
1108  } \
1109  break
1110 
1111  switch (blocksize_requested) {
1112  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(3);
1113  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(5);
1114  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(7);
1115  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(9);
1116  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
1117  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
1118  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
1119  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
1120  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
1121  default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(0);
1122  }
1123 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
1124  }
1125  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
1126  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
1127  }
1128 };
1129 
1130 } // namespace BlockHelperDetails
1131 
1132 } // namespace Ifpack2
1133 
1134 #endif