Ifpack2 Templated Preconditioning Package, Version 1.0
Ifpack2_BlockComputeResidualVector.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
11 #define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
12 
13 #include "Ifpack2_BlockHelper.hpp"
14 
15 namespace Ifpack2 {
16 
17  namespace BlockHelperDetails {
18 
22  template <typename MatrixType>
23  struct AmD {
24  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
25  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
26  using size_type_1d_view = typename impl_type::size_type_1d_view;
27  using i64_3d_view = typename impl_type::i64_3d_view;
28  using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
29  // rowptr points to the start of each row of A_colindsub.
30  size_type_1d_view rowptr, rowptr_remote;
31  // Indices into A's rows giving the blocks to extract. rowptr(i) points to
32  // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
33  // where g is A's graph, are the columns AmD uses. If seq_method_, then
34  // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
35  // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
36  // contains the remote ones.
37  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
38  // Precomputed direct offsets to A,x blocks, for owned entries (OverlapTag case) or all entries (AsyncTag case)
39  i64_3d_view A_x_offsets;
40  // Precomputed direct offsets to A,x blocks, for non-owned entries (OverlapTag case). For AsyncTag case this is left empty.
41  i64_3d_view A_x_offsets_remote;
42 
43  // Currently always true.
44  bool is_tpetra_block_crs;
45 
46  // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
47  impl_scalar_type_1d_view_tpetra tpetra_values;
48 
49  AmD() = default;
50  AmD(const AmD &b) = default;
51  };
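// Editorial sketch (not part of the original header), assuming A_block_rowptr and
// A_colind are the row map and entries of A's local block graph (they are passed to
// the kernels separately): this is how the CSR-like members above are meant to be
// traversed for one local row lr, per the comments on rowptr/A_colindsub.
//
//   const auto A_k0 = A_block_rowptr(lr);                    // start of row lr in A's graph
//   for (auto k = amd.rowptr(lr); k < amd.rowptr(lr + 1); ++k) {
//     const auto j   = A_k0 + amd.A_colindsub(k);            // entry index in A's graph
//     const auto col = A_colind(j);                          // block column id
//     const auto* blk = &amd.tpetra_values(j * blocksize * blocksize);
//     // ... apply the blocksize x blocksize block blk against x(col * blocksize, :)
//   }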
52 
53  template<typename MatrixType>
54  struct PartInterface {
55  using local_ordinal_type = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
56  using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
57  using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;
58 
59  PartInterface() = default;
60  PartInterface(const PartInterface &b) = default;
61 
62  // Some terms:
63  // The matrix A is split as A = D + R, where D is the matrix of tridiag
64  // blocks and R is the remainder.
65  // A part is roughly a synonym for a tridiag. The distinction is that a part
66  // is the set of rows belonging to one tridiag and, equivalently, the off-diag
67  // rows in R associated with that tridiag. In contrast, the term tridiag is
68  // used to refer specifically to tridiag data, such as the pointer into the
69  // tridiag data array.
70  // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
71  // tridiag, and partptr points to the beginning of each tridiag. This is the
72  // LID space.
73  // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
74  // by this ordinal. This is the 'index' space.
75  // A flat index is the mathematical index into an array. A pack index
76  // accounts for SIMD packing.
77 
78  // Local row LIDs. Permutation from caller's index space to tridiag index
79  // space.
80  local_ordinal_type_1d_view lclrow;
81  // partptr_ is the pointer array into lclrow_.
82  local_ordinal_type_1d_view partptr; // np+1
83  local_ordinal_type_2d_view partptr_sub;
84  local_ordinal_type_1d_view partptr_schur;
85  // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
86  // is the start of the i'th pack.
87  local_ordinal_type_1d_view packptr; // npack+1
88  local_ordinal_type_1d_view packptr_sub;
89  local_ordinal_type_1d_view packindices_sub;
90  local_ordinal_type_2d_view packindices_schur;
91  // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
92  // an alias of partptr_ in the case of no overlap.
93  local_ordinal_type_1d_view part2rowidx0; // np+1
94  local_ordinal_type_1d_view part2rowidx0_sub;
95  // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
96  // it's the same as part2rowidx0_; if it's > 1, then the value is combined
97  // with i % vector_length to get the location in the packed data.
98  local_ordinal_type_1d_view part2packrowidx0; // np+1
99  local_ordinal_type_2d_view part2packrowidx0_sub;
100  local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
101  // rowidx2part_ maps the row index to the part index.
102  local_ordinal_type_1d_view rowidx2part; // nr
103  local_ordinal_type_1d_view rowidx2part_sub;
104  // True if lcl{row|col} is at most a constant away from row{idx|col}. In
105  // practice, this knowledge is not particularly useful, as packing for batched
106  // processing is done at the same time as the permutation from LID to index
107  // space. But it's easy to detect, so it's recorded in case an optimization
108  // can be made based on it.
109  bool row_contiguous;
110 
111  local_ordinal_type max_partsz;
112  local_ordinal_type max_subpartsz;
113  local_ordinal_type n_subparts_per_part;
114  local_ordinal_type nparts;
115  };
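// Editorial sketch (not part of the original header): the index spaces described
// above, for one part p. partptr gives the part's row-index range, lclrow maps a
// row index back to a LID, and part2packrowidx0 plus the lane p % vector_length
// locate the row in the packed (SIMD) data.
//
//   for (local_ordinal_type i = partptr(p); i < partptr(p + 1); ++i) {
//     const local_ordinal_type lr  = lclrow(i);                              // LID space
//     const local_ordinal_type pri = part2packrowidx0(p) + (i - partptr(p)); // pack row index
//     const local_ordinal_type v   = p % vector_length;                      // SIMD lane
//     // ... y_packed(pri, ., .)[v] corresponds to local row lr
//   }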
116 
117  // Precompute offsets of each A and x entry to speed up residual.
118  // (Applies for hasBlockCrsMatrix == true and OverlapTag/AsyncTag)
119  // Reading A and x takes up to 4 and 6 levels of indirection, respectively,
120  // but precomputing the offsets reduces this to 2 for both.
121  //
122  // This function allocates and populates these members of AmD:
123  // A_x_offsets, A_x_offsets_remote
124  template<typename MatrixType>
125  void precompute_A_x_offsets(
126  AmD<MatrixType> &amd,
127  const PartInterface<MatrixType> &interf,
128  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_crs_graph_type> &g,
129  const typename ImplType<MatrixType>::local_ordinal_type_1d_view &dm2cm,
130  int blocksize,
131  bool ownedRemoteSeparate)
132  {
133  using impl_type = ImplType<MatrixType>;
134  using i64_3d_view = typename impl_type::i64_3d_view;
135  using size_type = typename impl_type::size_type;
136  using local_ordinal_type = typename impl_type::local_ordinal_type;
137  using execution_space = typename impl_type::execution_space;
138  auto local_graph = g->getLocalGraphDevice();
139  const auto A_block_rowptr = local_graph.row_map;
140  const auto A_colind = local_graph.entries;
141  local_ordinal_type numLocalRows = interf.rowidx2part.extent(0);
142  int blocksize_square = blocksize * blocksize;
143  // shallow-copying views to avoid capturing the amd, interf objects in lambdas
144  auto lclrow = interf.lclrow;
145  auto A_colindsub = amd.A_colindsub;
146  auto A_colindsub_remote = amd.A_colindsub_remote;
147  auto rowptr = amd.rowptr;
148  auto rowptr_remote = amd.rowptr_remote;
149  bool is_dm2cm_active = dm2cm.extent(0);
150  if(ownedRemoteSeparate) {
151  // amd.rowptr points to owned entries only, and amd.rowptr_remote points to nonowned.
152  local_ordinal_type maxOwnedEntriesPerRow = 0;
153  local_ordinal_type maxNonownedEntriesPerRow = 0;
154  Kokkos::parallel_reduce(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
155  KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type& lmaxOwned, local_ordinal_type& lmaxNonowned)
156  {
157  const local_ordinal_type lr = lclrow(i);
158  local_ordinal_type rowNumOwned = rowptr(lr+1) - rowptr(lr);
159  if(rowNumOwned > lmaxOwned)
160  lmaxOwned = rowNumOwned;
161  //rowptr_remote won't be allocated for single-rank problems
162  if(rowptr_remote.extent(0)) {
163  local_ordinal_type rowNumNonowned = rowptr_remote(lr+1) - rowptr_remote(lr);
164  if(rowNumNonowned > lmaxNonowned)
165  lmaxNonowned = rowNumNonowned;
166  }
167  else {
168  lmaxNonowned = 0;
169  }
170  }, Kokkos::Max<local_ordinal_type>(maxOwnedEntriesPerRow), Kokkos::Max<local_ordinal_type>(maxNonownedEntriesPerRow));
171  // Allocate the two offsets views now that we know the dimensions
172  // For each one, the middle dimension is 0 for A offsets and 1 for x offsets.
173  // Packing them together in one view improves cache line utilization
174  amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxOwnedEntriesPerRow);
175  amd.A_x_offsets_remote = i64_3d_view("amd.A_x_offsets_remote", numLocalRows, 2, maxNonownedEntriesPerRow);
176  auto A_x_offsets = amd.A_x_offsets;
177  auto A_x_offsets_remote = amd.A_x_offsets_remote;
178  // Now, populate all the offsets. Use ArithTraits<int64_t>::min to mark absent entries.
179  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
180  KOKKOS_LAMBDA(local_ordinal_type i)
181  {
182  const local_ordinal_type lr = lclrow(i);
183  const size_type A_k0 = A_block_rowptr(lr);
184  // Owned entries
185  size_type rowBegin = rowptr(lr);
186  local_ordinal_type rowNumOwned = rowptr(lr+1) - rowBegin;
187  for(local_ordinal_type entry = 0; entry < maxOwnedEntriesPerRow; entry++) {
188  if(entry < rowNumOwned) {
189  const size_type j = A_k0 + A_colindsub(rowBegin + entry);
190  const local_ordinal_type A_colind_at_j = A_colind(j);
191  const local_ordinal_type loc = is_dm2cm_active ? dm2cm(A_colind_at_j) : A_colind_at_j;
192  A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
193  A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
194  }
195  else {
196  A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
197  A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
198  }
199  }
200  // Nonowned entries
201  if(rowptr_remote.extent(0)) {
202  rowBegin = rowptr_remote(lr);
203  local_ordinal_type rowNumNonowned = rowptr_remote(lr+1) - rowBegin;
204  for(local_ordinal_type entry = 0; entry < maxNonownedEntriesPerRow; entry++) {
205  if(entry < rowNumNonowned) {
206  const size_type j = A_k0 + A_colindsub_remote(rowBegin + entry);
207  const local_ordinal_type A_colind_at_j = A_colind(j);
208  const local_ordinal_type loc = A_colind_at_j - numLocalRows;
209  A_x_offsets_remote(i, 0, entry) = int64_t(j) * blocksize_square;
210  A_x_offsets_remote(i, 1, entry) = int64_t(loc) * blocksize;
211  }
212  else {
213  A_x_offsets_remote(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
214  A_x_offsets_remote(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
215  }
216  }
217  }
218  });
219  }
220  else {
221  // amd.rowptr points to both owned and nonowned entries, so it tells us how many columns (last dim) A_x_offsets should have.
222  local_ordinal_type maxEntriesPerRow = 0;
223  Kokkos::parallel_reduce(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
224  KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type& lmax)
225  {
226  const local_ordinal_type lr = lclrow(i);
227  local_ordinal_type rowNum = rowptr(lr+1) - rowptr(lr);
228  if(rowNum > lmax)
229  lmax = rowNum;
230  }, Kokkos::Max<local_ordinal_type>(maxEntriesPerRow));
231  amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxEntriesPerRow);
232  auto A_x_offsets = amd.A_x_offsets;
233  //Populate A,x offsets. Use ArithTraits<int64_t>::min to mark absent entries.
234  //For x offsets, add a shift blocksize*numLocalRows to represent that it indexes into x_remote instead of x.
235  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
236  KOKKOS_LAMBDA(local_ordinal_type i)
237  {
238  const local_ordinal_type lr = lclrow(i);
239  const size_type A_k0 = A_block_rowptr(lr);
240  // Owned entries
241  size_type rowBegin = rowptr(lr);
242  local_ordinal_type rowOwned = rowptr(lr+1) - rowBegin;
243  for(local_ordinal_type entry = 0; entry < maxEntriesPerRow; entry++) {
244  if(entry < rowOwned) {
245  const size_type j = A_k0 + A_colindsub(rowBegin + entry);
246  A_x_offsets(i, 0, entry) = j * blocksize_square;
247  const local_ordinal_type A_colind_at_j = A_colind(j);
248  if(A_colind_at_j < numLocalRows) {
249  const local_ordinal_type loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
250  A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
251  }
252  else {
253  A_x_offsets(i, 1, entry) = int64_t(A_colind_at_j) * blocksize;
254  }
255  }
256  else {
257  A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
258  A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
259  }
260  }
261  });
262  }
263  }
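// Editorial sketch (not part of the original header): how the precomputed offsets
// are consumed (see the block-CRS GPU operator() further below). Each (row, entry)
// pair stores direct offsets into tpetra_values and into x, so reading an A block
// and its x block takes only two indirections; absent entries are marked with
// ArithTraits<int64_t>::min().
//
//   const int64_t A_offset = A_x_offsets(rowidx, 0, k);
//   const int64_t x_offset = A_x_offsets(rowidx, 1, k);
//   if (A_offset != Kokkos::ArithTraits<int64_t>::min()) {
//     const auto* A_blk = tpetra_values.data() + A_offset;   // blocksize x blocksize values
//     const auto* x_blk = &x(x_offset, col);                 // blocksize entries of column col
//     // ... accumulate A_blk * x_blk into the local residual
//   }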
264 
268  static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
269  const int team_size) {
270  int total_team_size(0);
271  if (blksize <= 5) total_team_size = 32;
272  else if (blksize <= 9) total_team_size = 32; // 64
273  else if (blksize <= 12) total_team_size = 96;
274  else if (blksize <= 16) total_team_size = 128;
275  else if (blksize <= 20) total_team_size = 160;
276  else total_team_size = 160;
277  return total_team_size/team_size;
278  }
279 
280  static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
281  const int team_size) {
282  int total_team_size(0);
283  if (blksize <= 5) total_team_size = 32;
284  else if (blksize <= 9) total_team_size = 32; // 64
285  else if (blksize <= 12) total_team_size = 96;
286  else if (blksize <= 16) total_team_size = 128;
287  else if (blksize <= 20) total_team_size = 160;
288  else total_team_size = 160;
289  return total_team_size/team_size;
290  }
291 
292  static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
293  const int team_size) {
294  int total_team_size(0);
295  if (blksize <= 5) total_team_size = 32;
296  else if (blksize <= 9) total_team_size = 32; // 64
297  else if (blksize <= 12) total_team_size = 96;
298  else if (blksize <= 16) total_team_size = 128;
299  else if (blksize <= 20) total_team_size = 160;
300  else total_team_size = 160;
301  return total_team_size/team_size;
302  }
303 
304  template<typename T>
305  static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
306  const int team_size) {
307  if ( is_cuda<T>::value )
308  return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
309  if ( is_hip<T>::value )
310  return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
311  if ( is_sycl<T>::value )
312  return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
313  return -1;
314  }
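// Editorial note (not part of the original header): these helpers return a vector
// length such that team_size * vector_size matches the recommended total team size
// for the block size (e.g. 32 when blocksize <= 9). The run() methods below use
// them as in this sketch, where nrows stands for the number of block rows:
//
//   const local_ordinal_type team_size   = 8;
//   const local_ordinal_type vector_size =
//     ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
//   Kokkos::TeamPolicy<execution_space, SeqTag> policy(nrows, team_size, vector_size);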
315 
316  template<typename MatrixType>
317  struct ComputeResidualVector {
318  public:
319  using impl_type = BlockHelperDetails::ImplType<MatrixType>;
320  using node_device_type = typename impl_type::node_device_type;
321  using execution_space = typename impl_type::execution_space;
322  using memory_space = typename impl_type::memory_space;
323 
324  using local_ordinal_type = typename impl_type::local_ordinal_type;
325  using size_type = typename impl_type::size_type;
326  using impl_scalar_type = typename impl_type::impl_scalar_type;
327  using magnitude_type = typename impl_type::magnitude_type;
328  using btdm_scalar_type = typename impl_type::btdm_scalar_type;
329  using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
331  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
332  using size_type_1d_view = typename impl_type::size_type_1d_view;
333  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
334  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
335  using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
336  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
337  using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
338  using i64_3d_view = typename impl_type::i64_3d_view;
339  static constexpr int vector_length = impl_type::vector_length;
340 
342  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
343 
344  // enum for max blocksize
345  enum : int { max_blocksize = 32 };
346 
347  private:
348  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
349  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
350  ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
351  Unmanaged<impl_scalar_type_2d_view_tpetra> y;
352  Unmanaged<vector_type_3d_view> y_packed;
353  Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
354 
355  // AmD information
356  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
357  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
358  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
359 
360  // block crs graph information
361  // for cuda (kokkos crs graph uses a different size_type from size_t)
362  const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_block_rowptr;
363  const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_point_rowptr;
364  const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
365 
366  // blocksize
367  const local_ordinal_type blocksize_requested;
368 
369  // part interface
370  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
371  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
372  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
373  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
374  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
375  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
376 
377  // block offsets
378  const ConstUnmanaged<i64_3d_view> A_x_offsets;
379  const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;
380 
381  const bool is_dm2cm_active;
382  const bool hasBlockCrsMatrix;
383 
384  public:
385  template<typename LocalCrsGraphType>
386  ComputeResidualVector(const AmD<MatrixType> &amd,
387  const LocalCrsGraphType &block_graph,
388  const LocalCrsGraphType &point_graph,
389  const local_ordinal_type &blocksize_requested_,
390  const PartInterface<MatrixType> &interf,
391  const local_ordinal_type_1d_view &dm2cm_,
392  bool hasBlockCrsMatrix_)
393  : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
394  colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
395  tpetra_values(amd.tpetra_values),
396  A_block_rowptr(block_graph.row_map),
397  A_point_rowptr(point_graph.row_map),
398  A_colind(block_graph.entries),
399  blocksize_requested(blocksize_requested_),
400  part2packrowidx0(interf.part2packrowidx0),
401  part2rowidx0(interf.part2rowidx0),
402  rowidx2part(interf.rowidx2part),
403  partptr(interf.partptr),
404  lclrow(interf.lclrow),
405  dm2cm(dm2cm_),
406  A_x_offsets(amd.A_x_offsets),
407  A_x_offsets_remote(amd.A_x_offsets_remote),
408  is_dm2cm_active(dm2cm_.span() > 0),
409  hasBlockCrsMatrix(hasBlockCrsMatrix_)
410  {}
411 
412  inline
413  void
414  SerialDot(const local_ordinal_type &blocksize,
415  const local_ordinal_type &lclRowID,
416  const local_ordinal_type &lclColID,
417  const local_ordinal_type &ii,
418  const ConstUnmanaged<local_ordinal_type_1d_view>colindsub_,
419  const impl_scalar_type * const KOKKOS_RESTRICT xx,
420  /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
421  const size_type Aj_c = colindsub_(lclColID);
422  auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
423  impl_scalar_type val = 0;
424 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
425 # pragma ivdep
426 #endif
427 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
428 # pragma unroll
429 #endif
430  for (local_ordinal_type k1=0;k1<blocksize;++k1)
431  val += tpetra_values(point_row_offset + k1) * xx[k1];
432  yy[ii] -= val;
433  }
434 
435  inline
436  void
437  SerialGemv(const local_ordinal_type &blocksize,
438  const impl_scalar_type * const KOKKOS_RESTRICT AA,
439  const impl_scalar_type * const KOKKOS_RESTRICT xx,
440  /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
441  using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
442  for (local_ordinal_type k0=0;k0<blocksize;++k0) {
443  impl_scalar_type val = 0;
444 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
445 # pragma ivdep
446 #endif
447 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
448 # pragma unroll
449 #endif
450  for (local_ordinal_type k1=0;k1<blocksize;++k1)
451  val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
452  yy[k0] -= val;
453  }
454  }
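// Editorial note (not part of the original header): SerialDot and SerialGemv both
// accumulate y -= A*x for a single block; they subtract into yy rather than
// overwrite it. An illustrative call for a block-CRS entry j, using the local
// names of the SeqTag operator() below:
//
//   const impl_scalar_type* AA = &tpetra_values(j * blocksize * blocksize);
//   const impl_scalar_type* xx = &x(A_colind[j] * blocksize, col);
//   SerialGemv(blocksize, AA, xx, yy);   // yy[k0] -= sum_k1 AA(k0, k1) * xx[k1]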
455 
456  template<typename bbViewType, typename yyViewType>
457  KOKKOS_INLINE_FUNCTION
458  void
459  VectorCopy(const member_type &member,
460  const local_ordinal_type &blocksize,
461  const bbViewType &bb,
462  const yyViewType &yy) const {
463  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
464  yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
465  });
466  }
467 
468  template<typename xxViewType, typename yyViewType>
469  KOKKOS_INLINE_FUNCTION
470  void
471  VectorDot(const member_type &member,
472  const local_ordinal_type &blocksize,
473  const local_ordinal_type &lclRowID,
474  const local_ordinal_type &lclColID,
475  const local_ordinal_type &ii,
476  const ConstUnmanaged<local_ordinal_type_1d_view>colindsub_,
477  const xxViewType &xx,
478  const yyViewType &yy) const {
479  const size_type Aj_c = colindsub_(lclColID);
480  auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
481  impl_scalar_type val = 0;
482  Kokkos::parallel_reduce
483  (Kokkos::ThreadVectorRange(member, blocksize),
484  [&](const local_ordinal_type &k1, impl_scalar_type& update) {
485  update += tpetra_values(point_row_offset + k1) *xx(k1);
486  }, val);
487  Kokkos::single(Kokkos::PerThread(member),
488  [=]()
489  {
490  Kokkos::atomic_add(&yy(ii), typename yyViewType::const_value_type(-val));
491  });
492  }
493 
494  // BMK: This version coalesces accesses to AA for LayoutRight blocks.
495  template<typename AAViewType, typename xxViewType, typename yyViewType>
496  KOKKOS_INLINE_FUNCTION
497  void
498  VectorGemv(const member_type &member,
499  const local_ordinal_type &blocksize,
500  const AAViewType &AA,
501  const xxViewType &xx,
502  const yyViewType &yy) const {
503  for (local_ordinal_type k0=0;k0<blocksize;++k0) {
504  impl_scalar_type val = 0;
505  Kokkos::parallel_reduce
506  (Kokkos::ThreadVectorRange(member, blocksize),
507  [&](const local_ordinal_type &k1, impl_scalar_type& update) {
508  update += AA(k0,k1)*xx(k1);
509  }, val);
510  Kokkos::single(Kokkos::PerThread(member),
511  [=]()
512  {
513  Kokkos::atomic_add(&yy(k0), -val);
514  });
515  }
516  }
517 
518  struct SeqTag {};
519 
520  KOKKOS_INLINE_FUNCTION
521  void
522  operator() (const SeqTag &, const local_ordinal_type& i) const {
523  const local_ordinal_type blocksize = blocksize_requested;
524  const local_ordinal_type blocksize_square = blocksize*blocksize;
525 
526  // constants
527  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
528  const local_ordinal_type num_vectors = y.extent(1);
529  const local_ordinal_type row = i*blocksize;
530  for (local_ordinal_type col=0;col<num_vectors;++col) {
531  // y := b
532  impl_scalar_type *yy = &y(row, col);
533  const impl_scalar_type * const bb = &b(row, col);
534  memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
535 
536  // y -= Rx
537  const size_type A_k0 = A_block_rowptr[i];
538  for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
539  const size_type j = A_k0 + colindsub[k];
540  const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
541  if (hasBlockCrsMatrix) {
542  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
543  SerialGemv(blocksize,AA,xx,yy);
544  }
545  else {
546  for (local_ordinal_type k0=0;k0<blocksize;++k0)
547  SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
548  }
549  }
550  }
551  }
552 
553  KOKKOS_INLINE_FUNCTION
554  void
555  operator() (const SeqTag &, const member_type &member) const {
556 
557  // constants
558  const local_ordinal_type blocksize = blocksize_requested;
559  const local_ordinal_type blocksize_square = blocksize*blocksize;
560 
561  const local_ordinal_type lr = member.league_rank();
562  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
563  const local_ordinal_type num_vectors = y.extent(1);
564 
565  // subview pattern
566  auto bb = Kokkos::subview(b, block_range, 0);
567  auto xx = bb;
568  auto yy = Kokkos::subview(y, block_range, 0);
569  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
570 
571  const local_ordinal_type row = lr*blocksize;
572  for (local_ordinal_type col=0;col<num_vectors;++col) {
573  // y := b
574  yy.assign_data(&y(row, col));
575  bb.assign_data(&b(row, col));
576  if (member.team_rank() == 0)
577  VectorCopy(member, blocksize, bb, yy);
578  member.team_barrier();
579 
580  // y -= Rx
581  const size_type A_k0 = A_block_rowptr[lr];
582 
583  if(hasBlockCrsMatrix) {
584  Kokkos::parallel_for
585  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
586  [&](const local_ordinal_type &k) {
587  const size_type j = A_k0 + colindsub[k];
588  xx.assign_data( &x(A_colind[j]*blocksize, col) );
589  A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
590  VectorGemv(member, blocksize, A_block_cst, xx, yy);
591  });
592  }
593  else {
594  Kokkos::parallel_for
595  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
596  [&](const local_ordinal_type &k) {
597 
598  const size_type j = A_k0 + colindsub[k];
599  xx.assign_data( &x(A_colind[j]*blocksize, col) );
600 
601  for (local_ordinal_type k0=0;k0<blocksize;++k0)
602  VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
603  });
604  }
605  }
606  }
607 
608  // * B: block size for compile-time specialization, or 0 for general case (up to max_blocksize)
609  // * async: true if using async importer. overlap is not used in this case.
610  // Whether a column is owned or nonowned is decided at runtime.
611  // * overlap: true if processing the columns that are not locally owned,
612  // false if processing locally owned columns.
613  // * haveBlockMatrix: true if A is a BlockCrsMatrix, false if it is a CrsMatrix.
614  template<int B, bool async, bool overlap, bool haveBlockMatrix>
615  struct GeneralTag
616  {
617  static_assert(!(async && overlap),
618  "ComputeResidualVector: async && overlap is not a valid configuration for GeneralTag");
619  };
620 
621  // Define AsyncTag and OverlapTag in terms of GeneralTag:
622  // P == 0 means only compute on owned columns
623  // P == 1 means only compute on nonowned columns
624  template<int P, int B, bool haveBlockMatrix>
625  using OverlapTag = GeneralTag<B, false, P != 0, haveBlockMatrix>;
626 
627  template<int B, bool haveBlockMatrix>
628  using AsyncTag = GeneralTag<B, true, false, haveBlockMatrix>;
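// Editorial note (not part of the original header): the aliases above only fix the
// async/overlap flags of GeneralTag, e.g.
//
//   static_assert(std::is_same<AsyncTag<3, true>,
//                              GeneralTag<3, true,  false, true > >::value, "");
//   static_assert(std::is_same<OverlapTag<1, 3, false>,
//                              GeneralTag<3, false, true,  false> >::value, "");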
629 
630  // CPU implementation for all cases
631  template<int B, bool async, bool overlap, bool haveBlockMatrix>
632  KOKKOS_INLINE_FUNCTION
633  void
634  operator() (const GeneralTag<B, async, overlap, haveBlockMatrix>&, const local_ordinal_type &rowidx) const {
635  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
636 
637  // constants
638  const local_ordinal_type partidx = rowidx2part(rowidx);
639  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
640  const local_ordinal_type v = partidx % vector_length;
641 
642  const local_ordinal_type num_vectors = y_packed.extent(2);
643  const local_ordinal_type num_local_rows = lclrow.extent(0);
644 
645  // temporary buffer for y flat
646  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
647 
648  const local_ordinal_type lr = lclrow(rowidx);
649 
650  auto colindsub_used = overlap ? colindsub_remote : colindsub;
651  auto rowptr_used = overlap ? rowptr_remote : rowptr;
652 
653  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
654  if constexpr(overlap) {
655  // y (temporary) := 0
656  memset((void*) yy, 0, sizeof(impl_scalar_type)*blocksize);
657  }
658  else {
659  // y := b
660  const local_ordinal_type row = lr*blocksize;
661  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
662  }
663 
664  // y -= Rx
665  const size_type A_k0 = A_block_rowptr[lr];
666  for (size_type k = rowptr_used[lr]; k < rowptr_used[lr+1]; ++k) {
667  const size_type j = A_k0 + colindsub_used[k];
668  const local_ordinal_type A_colind_at_j = A_colind[j];
669  if constexpr (haveBlockMatrix) {
670  const local_ordinal_type blocksize_square = blocksize*blocksize;
671  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
672  if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
673  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
674  const impl_scalar_type * const xx = &x(loc*blocksize, col);
675  SerialGemv(blocksize, AA,xx,yy);
676  } else {
677  const auto loc = A_colind_at_j - num_local_rows;
678  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
679  SerialGemv(blocksize, AA,xx_remote,yy);
680  }
681  }
682  else {
683  if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
684  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
685  const impl_scalar_type * const xx = &x(loc*blocksize, col);
686  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
687  SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
688  } else {
689  const auto loc = A_colind_at_j - num_local_rows;
690  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
691  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
692  SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
693  }
694  }
695  }
696  // move yy to y_packed
697  if constexpr(overlap) {
698  for (local_ordinal_type k=0;k<blocksize;++k)
699  y_packed(pri, k, col)[v] += yy[k];
700  }
701  else {
702  for (local_ordinal_type k=0;k<blocksize;++k)
703  y_packed(pri, k, col)[v] = yy[k];
704  }
705  }
706  }
707 
708  // GPU implementation for hasBlockCrsMatrix == true
709  template<int B, bool async, bool overlap>
710  KOKKOS_INLINE_FUNCTION
711  void
712  operator() (const GeneralTag<B, async, overlap, true>&, const member_type &member) const {
713  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
714 
715  // constants
716  const local_ordinal_type rowidx = member.league_rank();
717  const local_ordinal_type partidx = rowidx2part(rowidx);
718  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
719  const local_ordinal_type v = partidx % vector_length;
720 
721  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
722  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
723  const local_ordinal_type num_local_rows = lclrow.extent(0);
724 
725  // subview pattern
726  using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
727  using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
728  subview_1D_right_t bb(nullptr, blocksize);
729  subview_1D_right_t xx(nullptr, blocksize);
730  subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
731  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
732 
733  // Get shared allocation for a local copy of x, Ax, and A
734  impl_scalar_type * local_Ax = reinterpret_cast<impl_scalar_type *>(member.team_scratch(0).get_shmem(blocksize*sizeof(impl_scalar_type)));
735  impl_scalar_type * local_x = reinterpret_cast<impl_scalar_type *>(member.thread_scratch(0).get_shmem(blocksize*sizeof(impl_scalar_type)));
736 
737  const local_ordinal_type lr = lclrow(rowidx);
738  const local_ordinal_type row = lr*blocksize;
739  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
740  if(col)
741  member.team_barrier();
742  // y -= Rx
743  // Initialize accumulation array
744  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),[&](const local_ordinal_type & i){
745  local_Ax[i] = 0;
746  });
747  member.team_barrier();
748 
749  int numEntries;
750  if constexpr(!overlap) {
751  numEntries = A_x_offsets.extent(2);
752  }
753  else {
754  numEntries = A_x_offsets_remote.extent(2);
755  }
756 
757  Kokkos::parallel_for
758  (Kokkos::TeamThreadRange(member, 0, numEntries),
759  [&](const int k) {
760  int64_t A_offset = overlap ? A_x_offsets_remote(rowidx, 0, k) : A_x_offsets(rowidx, 0, k);
761  int64_t x_offset = overlap ? A_x_offsets_remote(rowidx, 1, k) : A_x_offsets(rowidx, 1, k);
762  if(A_offset != Kokkos::ArithTraits<int64_t>::min()) {
763  A_block_cst.assign_data(tpetra_values.data() + A_offset);
764  // Pull x into local memory
765  if constexpr(async) {
766  size_type remote_cutoff = blocksize * num_local_rows;
767  if(x_offset >= remote_cutoff)
768  xx.assign_data(&x_remote(x_offset - remote_cutoff, col));
769  else
770  xx.assign_data(&x(x_offset, col));
771  }
772  else {
773  if constexpr(!overlap) {
774  xx.assign_data(&x(x_offset, col));
775  }
776  else {
777  xx.assign_data(&x_remote(x_offset, col));
778  }
779  }
780 
781  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize),[&](const local_ordinal_type & i){
782  local_x[i] = xx(i);
783  });
784 
785  // MatVec op Ax += A*x
786  Kokkos::parallel_for(
787  Kokkos::ThreadVectorRange(member, blocksize),
788  [&](const local_ordinal_type &k0) {
789  impl_scalar_type val = 0;
790  for(int k1=0; k1<blocksize; k1++)
791  val += A_block_cst(k0,k1)*local_x[k1];
792  Kokkos::atomic_add(local_Ax+k0, val);
793  });
794  }
795  });
796  member.team_barrier();
797  // Update y = b - local_Ax
798  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
799  bb.assign_data(&b(row, col));
800  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize),[&](const local_ordinal_type & i){
801  if (!overlap)
802  yy(i) = bb(i) - local_Ax[i];
803  else
804  yy(i) -= local_Ax[i];
805  });
806  }
807  }
808 
809  // GPU implementation for hasBlockCrsMatrix == false
810  template<int B, bool async, bool overlap>
811  KOKKOS_INLINE_FUNCTION
812  void
813  operator() (const GeneralTag<B, async, overlap, false>&, const member_type &member) const {
814  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
815 
816  // constants
817  const local_ordinal_type rowidx = member.league_rank();
818  const local_ordinal_type partidx = rowidx2part(rowidx);
819  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
820  const local_ordinal_type v = partidx % vector_length;
821 
822  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
823  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
824  const local_ordinal_type num_local_rows = lclrow.extent(0);
825 
826  // subview pattern
827  using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
828  using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
829  subview_1D_right_t bb(nullptr, blocksize);
830  subview_1D_right_t xx(nullptr, blocksize);
831  subview_1D_right_t xx_remote(nullptr, blocksize);
832  subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
833  auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
834  auto colindsub_used = overlap ? colindsub_remote : colindsub;
835  auto rowptr_used = overlap ? rowptr_remote : rowptr;
836 
837  const local_ordinal_type lr = lclrow(rowidx);
838  const local_ordinal_type row = lr*blocksize;
839  for (local_ordinal_type col = 0; col < num_vectors; ++col) {
840  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
841  if (!overlap) {
842  // y := b
843  bb.assign_data(&b(row, col));
844  if (member.team_rank() == 0)
845  VectorCopy(member, blocksize, bb, yy);
846  member.team_barrier();
847  }
848 
849  // y -= Rx
850  const size_type A_k0 = A_block_rowptr[lr];
851  Kokkos::parallel_for
852  (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
853  [&](const local_ordinal_type &k) {
854  const size_type j = A_k0 + colindsub_used[k];
855  const local_ordinal_type A_colind_at_j = A_colind[j];
856  if ((async && A_colind_at_j < num_local_rows) || (!async && !overlap)) {
857  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
858  xx.assign_data( &x(loc*blocksize, col) );
859  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
860  VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
861  } else {
862  const auto loc = A_colind_at_j - num_local_rows;
863  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
864  for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
865  VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
866  }
867  });
868  }
869  }
870 
871  // y = b - Rx; seq method
872  template<typename MultiVectorLocalViewTypeY,
873  typename MultiVectorLocalViewTypeB,
874  typename MultiVectorLocalViewTypeX>
875  void run(const MultiVectorLocalViewTypeY &y_,
876  const MultiVectorLocalViewTypeB &b_,
877  const MultiVectorLocalViewTypeX &x_) {
878  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
879  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);
880 
881  y = y_; b = b_; x = x_;
882  if constexpr (is_device<execution_space>::value) {
883  const local_ordinal_type blocksize = blocksize_requested;
884  const local_ordinal_type team_size = 8;
885  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
886  const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
887  Kokkos::parallel_for
888  ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
889  } else {
890  const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
891  Kokkos::parallel_for
892  ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
893  }
894  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
895  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
896  }
897 
898  // y = b - R (x , x_remote)
899  template<typename MultiVectorLocalViewTypeB,
900  typename MultiVectorLocalViewTypeX,
901  typename MultiVectorLocalViewTypeX_Remote>
902  void run(const vector_type_3d_view &y_packed_,
903  const MultiVectorLocalViewTypeB &b_,
904  const MultiVectorLocalViewTypeX &x_,
905  const MultiVectorLocalViewTypeX_Remote &x_remote_) {
906  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
907  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);
908 
909  b = b_; x = x_; x_remote = x_remote_;
910  if constexpr (is_device<execution_space>::value) {
911  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
912  y_packed_.extent(0),
913  y_packed_.extent(1),
914  y_packed_.extent(2),
915  vector_length);
916  } else {
917  y_packed = y_packed_;
918  }
919 
920  if constexpr(is_device<execution_space>::value) {
921  const local_ordinal_type blocksize = blocksize_requested;
922  // local_ordinal_type vl_power_of_two = 1;
923  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
924  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
925  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
926 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
927  if(this->hasBlockCrsMatrix) { \
928  const local_ordinal_type team_size = 8; \
929  const local_ordinal_type vector_size = 8; \
930  const size_t shmem_team_size = blocksize*sizeof(btdm_scalar_type); \
931  const size_t shmem_thread_size = blocksize*sizeof(btdm_scalar_type); \
932  Kokkos::TeamPolicy<execution_space,AsyncTag<B, true> > \
933  policy(rowidx2part.extent(0), team_size, vector_size); \
934  policy.set_scratch_size(0,Kokkos::PerTeam(shmem_team_size),Kokkos::PerThread(shmem_thread_size)); \
935  Kokkos::parallel_for \
936  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
937  policy, *this); \
938  } \
939  else { \
940  const local_ordinal_type team_size = 8; \
941  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
942  const Kokkos::TeamPolicy<execution_space,AsyncTag<B, false> > \
943  policy(rowidx2part.extent(0), team_size, vector_size); \
944  Kokkos::parallel_for \
945  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
946  policy, *this); \
947  } \
948  } break
949  switch (blocksize_requested) {
950  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
951  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
952  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
953  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
954  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
955  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
956  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
957  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
958  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
959  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
960  }
961 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
962  } else {
963 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
964  if(this->hasBlockCrsMatrix) { \
965  const Kokkos::RangePolicy<execution_space,AsyncTag<B, true> > policy(0, rowidx2part.extent(0)); \
966  Kokkos::parallel_for \
967  ("ComputeResidual::RangePolicy::run<AsyncTag>", \
968  policy, *this); \
969  } \
970  else { \
971  const Kokkos::RangePolicy<execution_space,AsyncTag<B, false> > policy(0, rowidx2part.extent(0)); \
972  Kokkos::parallel_for \
973  ("ComputeResidual::RangePolicy::run<AsyncTag>", \
974  policy, *this); \
975  } \
976  } break
977 
978  switch (blocksize_requested) {
979  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
980  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
981  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
982  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
983  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
984  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
985  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
986  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
987  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
988  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
989  }
990 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
991  }
992  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
993  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
994  }
995 
996  // y = b - R (x , x_remote), with owned and nonowned columns handled in separate passes
997  template<typename MultiVectorLocalViewTypeB,
998  typename MultiVectorLocalViewTypeX,
999  typename MultiVectorLocalViewTypeX_Remote>
1000  void run(const vector_type_3d_view &y_packed_,
1001  const MultiVectorLocalViewTypeB &b_,
1002  const MultiVectorLocalViewTypeX &x_,
1003  const MultiVectorLocalViewTypeX_Remote &x_remote_,
1004  const bool compute_owned) {
1005  IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
1006  IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);
1007 
1008  b = b_; x = x_; x_remote = x_remote_;
1009  if constexpr (is_device<execution_space>::value) {
1010  y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
1011  y_packed_.extent(0),
1012  y_packed_.extent(1),
1013  y_packed_.extent(2),
1014  vector_length);
1015  } else {
1016  y_packed = y_packed_;
1017  }
1018 
1019  if constexpr (is_device<execution_space>::value) {
1020  const local_ordinal_type blocksize = blocksize_requested;
1021  // local_ordinal_type vl_power_of_two = 1;
1022  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
1023  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
1024  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
1025 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
1026  if(this->hasBlockCrsMatrix) { \
1027  const local_ordinal_type team_size = 8; \
1028  const local_ordinal_type vector_size = 8; \
1029  const size_t shmem_team_size = blocksize*sizeof(btdm_scalar_type); \
1030  const size_t shmem_thread_size = blocksize*sizeof(btdm_scalar_type); \
1031  if (compute_owned) { \
1032  Kokkos::TeamPolicy<execution_space,OverlapTag<0, B, true> > \
1033  policy(rowidx2part.extent(0), team_size, vector_size); \
1034  policy.set_scratch_size(0,Kokkos::PerTeam(shmem_team_size),Kokkos::PerThread(shmem_thread_size)); \
1035  Kokkos::parallel_for \
1036  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
1037  } else { \
1038  Kokkos::TeamPolicy<execution_space,OverlapTag<1, B, true> > \
1039  policy(rowidx2part.extent(0), team_size, vector_size); \
1040  policy.set_scratch_size(0,Kokkos::PerTeam(shmem_team_size),Kokkos::PerThread(shmem_thread_size)); \
1041  Kokkos::parallel_for \
1042  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
1043  } \
1044  } \
1045  else { \
1046  const local_ordinal_type team_size = 8; \
1047  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
1048  if (compute_owned) { \
1049  const Kokkos::TeamPolicy<execution_space,OverlapTag<0, B, false> > \
1050  policy(rowidx2part.extent(0), team_size, vector_size); \
1051  Kokkos::parallel_for \
1052  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
1053  } else { \
1054  const Kokkos::TeamPolicy<execution_space,OverlapTag<1, B, false> > \
1055  policy(rowidx2part.extent(0), team_size, vector_size); \
1056  Kokkos::parallel_for \
1057  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
1058  } \
1059  } \
1060  break
1061  switch (blocksize_requested) {
1062  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
1063  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
1064  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
1065  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
1066  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
1067  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
1068  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
1069  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
1070  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
1071  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
1072  }
1073 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
1074  } else {
1075 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
1076  if(this->hasBlockCrsMatrix) { \
1077  if (compute_owned) { \
1078  const Kokkos::RangePolicy<execution_space,OverlapTag<0, B, true>> \
1079  policy(0, rowidx2part.extent(0)); \
1080  Kokkos::parallel_for \
1081  ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
1082  } else { \
1083  const Kokkos::RangePolicy<execution_space,OverlapTag<1, B, true>> \
1084  policy(0, rowidx2part.extent(0)); \
1085  Kokkos::parallel_for \
1086  ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
1087  } \
1088  } \
1089  else { \
1090  if (compute_owned) { \
1091  const Kokkos::RangePolicy<execution_space,OverlapTag<0, B, false>> \
1092  policy(0, rowidx2part.extent(0)); \
1093  Kokkos::parallel_for \
1094  ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
1095  } else { \
1096  const Kokkos::RangePolicy<execution_space,OverlapTag<1, B, false>> \
1097  policy(0, rowidx2part.extent(0)); \
1098  Kokkos::parallel_for \
1099  ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
1100  } \
1101  } \
1102  break
1103 
1104  switch (blocksize_requested) {
1105  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
1106  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
1107  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
1108  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
1109  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
1110  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
1111  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
1112  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
1113  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
1114  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
1115  }
1116 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
1117  }
1118  IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
1119  IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
1120  }
1121  };
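// Editorial usage sketch (not part of the original header), assuming the caller
// has already built the AmD and PartInterface structures and holds the block and
// point local graphs of A:
//
//   ComputeResidualVector<MatrixType> cr(amd, block_graph, point_graph,
//                                        blocksize, interf, dm2cm,
//                                        hasBlockCrsMatrix);
//   cr.run(y_packed, b, x, x_remote);                  // y = b - R (x , x_remote), AsyncTag
//   cr.run(y_packed, b, x, x_remote, compute_owned);   // OverlapTag variant
//   cr.run(y, b, x);                                   // SeqTag variant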
1122 
1123 
1124  } // namespace BlockHelperDetails
1125 
1126 } // namespace Ifpack2
1127 
1128 #endif